From ab9e6d39a300a395461d2c970278c309d667d4d3 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Sun, 6 Nov 2022 17:18:09 +0100 Subject: Handle no traffic Never assume we need more traffic shift headroom than 1/maxTrafficShare when we have a max traffic share measurement. --- .../vespa/hosted/provision/autoscale/ClusterModel.java | 5 ++--- .../hosted/provision/autoscale/AutoscalingTest.java | 8 ++++---- .../hosted/provision/autoscale/ClusterModelTest.java | 18 +++++++++++------- .../yahoo/vespa/hosted/provision/autoscale/Loader.java | 7 +++++-- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 3025124b174..22a6a5812b2 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -225,15 +225,14 @@ public class ClusterModel { growthRateHeadroom = Math.min(growthRateHeadroom, 1 / queryFractionOfMax() + 0.1); // How much headroom is needed to handle sudden arrival of additional traffic due to another zone going down? - double maxTrafficShiftHeadroom = 10.0; // Cap to avoid extreme sizes from a current very small share double trafficShiftHeadroom; if (application.status().maxReadShare() == 0) // No traffic fraction data trafficShiftHeadroom = 2.0; // assume we currently get half of the global share of traffic else if (application.status().currentReadShare() == 0) - trafficShiftHeadroom = maxTrafficShiftHeadroom; + trafficShiftHeadroom = 1/application.status().maxReadShare(); else trafficShiftHeadroom = application.status().maxReadShare() / application.status().currentReadShare(); - trafficShiftHeadroom = Math.min(trafficShiftHeadroom, maxTrafficShiftHeadroom); + trafficShiftHeadroom = Math.min(trafficShiftHeadroom, 1/application.status().maxReadShare()); // Assumptions: 1) Write load is not organic so we should not grow to handle more. // (TODO: But allow applications to set their target write rate and size for that) diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java index 04cbdd2666d..25c5b772655 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java @@ -113,16 +113,16 @@ public class AutoscalingTest { @Test public void test_autoscaling_without_traffic() { - var min = new ClusterResources(1, 1, new NodeResources(2, 4, 10, 0.3)); - var now = new ClusterResources(4, 1, new NodeResources(2, 16, 10, 0.3)); - var max = new ClusterResources(4, 1, new NodeResources(3, 16, 50, 0.3)); + var min = new ClusterResources(1, 1, new NodeResources(0.5, 4, 10, 0.3)); + var now = new ClusterResources(4, 1, new NodeResources(8, 16, 10, 0.3)); + var max = new ClusterResources(4, 1, new NodeResources(16, 32, 50, 0.3)); var fixture = AutoscalingTester.fixture(min, now, max) .clusterType(ClusterSpec.Type.container) .awsProdSetup() .build(); var duration = fixture.loader().addMeasurements(new Load(0.04, 0.39, 0.01), 20); fixture.tester().clock().advance(duration.negated()); - fixture.loader().zeroTraffic(20); + fixture.loader().zeroTraffic(20, 1); fixture.tester().assertResources("Scaled down", 2, 1, 2, 16, 10, fixture.autoscale()); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java index 0559a232065..f31ad191637 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java @@ -41,25 +41,30 @@ public class ClusterModelTest { // No current traffic share: Ideal load is low but capped var model1 = clusterModel(new Status(0.0, 1.0), t -> t == 0 ? 10000.0 : 0.0, t -> 0.0); - assertEquals(0.10672097759674132, model1.idealLoad().cpu(), delta); + assertEquals(0.37067209775967414, model1.idealLoad().cpu(), delta); // Almost no current traffic share: Ideal load is low but capped var model2 = clusterModel(new Status(0.0001, 1.0), t -> t == 0 ? 10000.0 : 0.0, t -> 0.0); - assertEquals(0.10672097759674132, model2.idealLoad().cpu(), delta); + assertEquals(0.37067209775967414, model2.idealLoad().cpu(), delta); } @Test public void test_growth_headroom() { - // No current traffic: Ideal load is low but capped + // No traffic data: Ideal load assumes 2 regions var model1 = clusterModel(new Status(0.0, 0.0), t -> t == 0 ? 10000.0 : 0.0, t -> 0.0); assertEquals(0.2240325865580448, model1.idealLoad().cpu(), delta); - // Almost no current traffic: Ideal load is low but capped - var model2 = clusterModel(new Status(0.0001, 1.0), + // No traffic: Ideal load is higher since we now know there is only one zone + var model2 = clusterModel(new Status(0.0, 1.0), + t -> t == 0 ? 10000.0 : 0.0, t -> 0.0); + assertEquals(0.37067209775967414, model2.idealLoad().cpu(), delta); + + // Almost no current traffic: Similar number as above + var model3 = clusterModel(new Status(0.0001, 1.0), t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0); - assertEquals(0.0326530612244898, model2.idealLoad().cpu(), delta); + assertEquals(0.32653061224489793, model3.idealLoad().cpu(), delta); } private ClusterModel clusterModelWithNoData() { @@ -72,7 +77,6 @@ public class ClusterModelTest { ClusterSpec clusterSpec = clusterSpec(); Cluster cluster = cluster(resources()); application = application.with(cluster); - return new ClusterModel(application.with(status), clusterSpec, cluster, clock, Duration.ofMinutes(10), timeseries(cluster,100, queryRate, writeRate, clock), diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java index 39745b726a0..9158262b134 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java @@ -26,9 +26,12 @@ public class Loader { } /** Assign measured zero traffic in the same way as the system will. */ - public Duration zeroTraffic(int measurements) { + public Duration zeroTraffic(int measurements, int prodRegions) { try (var lock = fixture.tester().nodeRepository().applications().lock(fixture.applicationId())) { - var statusWithZeroLoad = fixture.application().status().withCurrentReadShare(0).withMaxReadShare(1); + var statusWithZeroLoad = fixture.application().status() + .withCurrentReadShare(0) + // the line below from TrafficShareUpdater + .withMaxReadShare(prodRegions < 2 ? 1.0 : 1.0 / ( prodRegions - 1.0)); fixture.tester().nodeRepository().applications().put(fixture.application().with(statusWithZeroLoad), lock); } return addQueryRateMeasurements(measurements, (n) -> 0.0); -- cgit v1.2.3