From aa65cbd0e8cce8a5e02c1a58f374290ed8cf9c40 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Wed, 16 Aug 2023 20:48:38 +0200 Subject: Don't use BCPInfo to scale down --- .../vespa/hosted/provision/autoscale/ClusterModel.java | 3 ++- .../autoscale/AutoscalingUsingBcpGroupInfoTest.java | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 40b0bd8d88b..f37c9e8e05d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -280,7 +280,8 @@ public class ClusterModel { ( 1 - cpu.queryFraction()) * cpu.idealLoad() * (clusterSpec.type().isContainer() ? 1 : groupSize()); - double cpuAdjustment = neededTotalVcpPerGroup / currentClusterTotalVcpuPerGroup; + // Max 1: Only use bcp group info if it indicates that we need to scale *up* + double cpuAdjustment = Math.max(1.0, neededTotalVcpPerGroup / currentClusterTotalVcpuPerGroup); return ideal.withCpu(peakLoad().cpu() / cpuAdjustment); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java index 379dbb27d87..034f948d0c9 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java @@ -173,7 +173,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.1, 0.45)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("No need for traffic shift headroom", - 2, 1, 2.0, 16.0, 40.8, + 3, 1, 4.0, 16.0, 40.8, fixture.autoscale()); } @@ -270,6 +270,21 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.autoscale()); } + @Test + public void test_autoscaling_containers_with_some_local_traffic() { + var fixture = DynamicProvisioningTester.fixture().clusterType(ClusterSpec.Type.container).awsProdSetup(true).build(); + + // Some local traffic + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(200, 1.9, 500)); + Duration duration1 = fixture.loader().addCpuMeasurements(0.01f, 10); + fixture.tester().clock().advance(duration1.negated()); + fixture.loader().addQueryRateMeasurements(10, __ -> 10.0); + fixture.tester().assertResources("Not scaling up by group info since remote queries are much cheaper than local", + 2, 1, 2.0, 16.0, 40.8, + fixture.autoscale()); + } + /** Tests with varying BCP group info parameters. */ @Test public void test_autoscaling_metrics() { -- cgit v1.2.3 From 9d90ee8457d5ac5ff95eb89c609ed909885446cc Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Wed, 16 Aug 2023 22:30:12 +0200 Subject: Adjust by ideal --- .../hosted/provision/autoscale/ClusterModel.java | 13 +++++--- .../AutoscalingUsingBcpGroupInfoTest.java | 38 +++++++++++----------- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index f37c9e8e05d..fad7f49ab91 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -247,16 +247,21 @@ public class ClusterModel { */ public Load idealLoad() { var ideal = new Load(cpu.idealLoad(), memory.idealLoad(), disk.idealLoad()).divide(redundancyAdjustment()); + System.out.println("---------------------------------"); + System.out.println("Initial ideal: " + ideal); if ( !cluster.bcpGroupInfo().isEmpty() && cluster.bcpGroupInfo().queryRate() > 0) { // Since we have little local information, use information about query cost in other groups Load bcpGroupIdeal = adjustQueryDependentIdealLoadByBcpGroupInfo(ideal); + System.out.println("Bcp group ideal: " + bcpGroupIdeal); // Do a weighted sum of the ideal "vote" based on local and bcp group info. // This avoids any discontinuities with a near-zero local query rate. double localInformationWeight = Math.min(1, averageQueryRate().orElse(0) / Math.min(queryRateGivingFullConfidence, cluster.bcpGroupInfo().queryRate())); ideal = ideal.multiply(localInformationWeight).add(bcpGroupIdeal.multiply(1 - localInformationWeight)); + System.out.println("local information weight: " + localInformationWeight); } + System.out.println("Adjusted ideal: " + ideal); return ideal; } @@ -272,17 +277,15 @@ public class ClusterModel { private Load adjustQueryDependentIdealLoadByBcpGroupInfo(Load ideal) { double currentClusterTotalVcpuPerGroup = nodes.not().retired().first().get().resources().vcpu() * groupSize(); - double targetQueryRateToHandle = ( canRescaleWithinBcpDeadline() ? averageQueryRate().orElse(0) : cluster.bcpGroupInfo().queryRate() ) * cluster.bcpGroupInfo().growthRateHeadroom() * trafficShiftHeadroom(); - double neededTotalVcpPerGroup = cluster.bcpGroupInfo().cpuCostPerQuery() * targetQueryRateToHandle / groupCount() + + double neededTotalVcpuPerGroup = cluster.bcpGroupInfo().cpuCostPerQuery() * targetQueryRateToHandle / groupCount() + ( 1 - cpu.queryFraction()) * cpu.idealLoad() * (clusterSpec.type().isContainer() ? 1 : groupSize()); - // Max 1: Only use bcp group info if it indicates that we need to scale *up* - double cpuAdjustment = Math.max(1.0, neededTotalVcpPerGroup / currentClusterTotalVcpuPerGroup); - return ideal.withCpu(peakLoad().cpu() / cpuAdjustment); + double cpuAdjustment = Math.max(1.0, neededTotalVcpuPerGroup / currentClusterTotalVcpuPerGroup); + return ideal.withCpu(ideal.cpu() / cpuAdjustment); } private boolean hasScaledIn(Duration period) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java index 034f948d0c9..be7bc3c44a8 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java @@ -32,7 +32,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.1, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 8, 1, 4.0, 7.4, 29.0, + 8, 1, 3.4, 7.4, 29.0, fixture.autoscale()); // Higher query rate @@ -40,7 +40,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(200, 1.1, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 8, 1, 8.0, 7.4, 29.0, + 8, 1, 6.8, 7.4, 29.0, fixture.autoscale()); // Higher headroom @@ -48,7 +48,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.3, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 8, 1, 4.8, 7.4, 29.0, + 8, 1, 4.0, 7.4, 29.0, fixture.autoscale()); // Higher per query cost @@ -56,7 +56,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.1, 0.45)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 8, 1, 6.0, 7.4, 29.0, + 8, 1, 5.1, 7.4, 29.0, fixture.autoscale()); // Bcp elsewhere is 0 - use local only @@ -85,7 +85,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.1, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 3, 3, 10.5, 43.2, 190.0, + 3, 3, 11.7, 43.2, 190.0, fixture.autoscale()); // Higher query rate @@ -93,7 +93,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(200, 1.1, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 3, 3, 20.9, 43.2, 190.0, + 3, 3, 23.1, 43.2, 190.0, fixture.autoscale()); // Higher headroom @@ -101,7 +101,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.3, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 3, 3, 12.4, 43.2, 190.0, + 3, 3, 13.8, 43.2, 190.0, fixture.autoscale()); // Higher per query cost @@ -109,7 +109,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.1, 0.45)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 3, 3, 15.7, 43.2, 190.0, + 3, 3, 17.4, 43.2, 190.0, fixture.autoscale()); } @@ -127,7 +127,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.1, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 8, 1, 4.0, 16.0, 40.8, + 4, 1, 8.0, 16.0, 40.8, fixture.autoscale()); // Higher query rate (mem and disk changes are due to being assigned larger hosts where we get less overhead share @@ -135,7 +135,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(200, 1.1, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 8, 1, 8.0, 16.0, 40.8, + 7, 1, 8.0, 16.0, 40.8, fixture.autoscale()); // Higher headroom @@ -143,7 +143,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.3, 0.3)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 5, 1, 8.0, 16.0, 40.8, + 8, 1, 4.0, 16.0, 40.8, fixture.autoscale()); // Higher per query cost @@ -151,7 +151,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(100, 1.1, 0.45)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 6, 1, 8.0, 16.0, 40.8, + 10, 1, 4.0, 16.0, 40.8, fixture.autoscale()); } @@ -186,7 +186,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.store(new BcpGroupInfo(200, 1.3, 0.45)); fixture.loader().addCpuMeasurements(0.7f, 10); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 8, 1, 14.2, 7.4, 29.0, + 8, 1, 11.9, 7.4, 29.0, fixture.autoscale()); // Some local traffic @@ -196,7 +196,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.tester().clock().advance(duration1.negated()); fixture.loader().addQueryRateMeasurements(10, __ -> 10.0); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 8, 1, 6.9, 7.4, 29.0, + 8, 1, 6.8, 7.4, 29.0, fixture.autoscale()); // Enough local traffic to get half the votes @@ -206,7 +206,7 @@ public class AutoscalingUsingBcpGroupInfoTest { fixture.tester().clock().advance(duration2.negated()); fixture.loader().addQueryRateMeasurements(10, __ -> 50.0); fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", - 8, 1, 2.9, 7.4, 29.0, + 8, 1, 3.0, 7.4, 29.0, fixture.autoscale()); // Mostly local @@ -276,12 +276,12 @@ public class AutoscalingUsingBcpGroupInfoTest { // Some local traffic fixture.tester().clock().advance(Duration.ofDays(2)); - fixture.store(new BcpGroupInfo(200, 1.9, 500)); - Duration duration1 = fixture.loader().addCpuMeasurements(0.01f, 10); + fixture.store(new BcpGroupInfo(200, 1.9, 0.01)); + Duration duration1 = fixture.loader().addCpuMeasurements(0.58f, 10); fixture.tester().clock().advance(duration1.negated()); fixture.loader().addQueryRateMeasurements(10, __ -> 10.0); - fixture.tester().assertResources("Not scaling up by group info since remote queries are much cheaper than local", - 2, 1, 2.0, 16.0, 40.8, + fixture.tester().assertResources("Not scaling down due to group info, even though it contains much evidence queries are cheap", + 3, 1, 4.0, 16.0, 40.8, fixture.autoscale()); } -- cgit v1.2.3 From 3f0ab6ff5310ca5bfc8d31dc0285582eb0584552 Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Wed, 16 Aug 2023 23:38:15 +0200 Subject: Remove debug --- .../java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index fad7f49ab91..414b0f00f89 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -247,8 +247,6 @@ public class ClusterModel { */ public Load idealLoad() { var ideal = new Load(cpu.idealLoad(), memory.idealLoad(), disk.idealLoad()).divide(redundancyAdjustment()); - System.out.println("---------------------------------"); - System.out.println("Initial ideal: " + ideal); if ( !cluster.bcpGroupInfo().isEmpty() && cluster.bcpGroupInfo().queryRate() > 0) { // Since we have little local information, use information about query cost in other groups Load bcpGroupIdeal = adjustQueryDependentIdealLoadByBcpGroupInfo(ideal); -- cgit v1.2.3 From 081bb95fd3f35bd97051c8738112774d8fab67b0 Mon Sep 17 00:00:00 2001 From: Valerij Fredriksen Date: Wed, 16 Aug 2023 23:38:45 +0200 Subject: Remove debug --- .../java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 414b0f00f89..8976dd9ff08 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -251,15 +251,12 @@ public class ClusterModel { // Since we have little local information, use information about query cost in other groups Load bcpGroupIdeal = adjustQueryDependentIdealLoadByBcpGroupInfo(ideal); - System.out.println("Bcp group ideal: " + bcpGroupIdeal); // Do a weighted sum of the ideal "vote" based on local and bcp group info. // This avoids any discontinuities with a near-zero local query rate. double localInformationWeight = Math.min(1, averageQueryRate().orElse(0) / Math.min(queryRateGivingFullConfidence, cluster.bcpGroupInfo().queryRate())); ideal = ideal.multiply(localInformationWeight).add(bcpGroupIdeal.multiply(1 - localInformationWeight)); - System.out.println("local information weight: " + localInformationWeight); } - System.out.println("Adjusted ideal: " + ideal); return ideal; } -- cgit v1.2.3