From 0ba8b0001cee9ae1aad8fbdfac863a79da212d1c Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Mon, 13 Feb 2023 12:23:55 +0100 Subject: Cold autoscaling WIP Core support for autoscaling using data from cluster deployments in other regions when there is little or no traffic in our own deployment. --- .../provision/applications/BcpGroupInfo.java | 66 ++++++ .../hosted/provision/applications/Cluster.java | 22 +- .../hosted/provision/autoscale/ClusterModel.java | 157 +++++++++----- .../vespa/hosted/provision/autoscale/Load.java | 4 + .../persistence/ApplicationSerializer.java | 22 ++ .../provision/autoscale/AutoscalingTester.java | 2 +- .../AutoscalingUsingBcpGroupInfoTest.java | 238 +++++++++++++++++++++ .../vespa/hosted/provision/autoscale/Fixture.java | 7 + .../vespa/hosted/provision/autoscale/Loader.java | 8 +- .../persistence/ApplicationSerializerTest.java | 4 + 10 files changed, 460 insertions(+), 70 deletions(-) create mode 100644 node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/BcpGroupInfo.java create mode 100644 node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java (limited to 'node-repository') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/BcpGroupInfo.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/BcpGroupInfo.java new file mode 100644 index 00000000000..6b0ea8532be --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/BcpGroupInfo.java @@ -0,0 +1,66 @@ +package com.yahoo.vespa.hosted.provision.applications; + +import java.util.Objects; + +/** + * When there are multiple deployments of an application in different regions, + * instances of the cluster across regions may form a "BCP group". 
+ * By default the clusters in all production regions form such a group, but other arrangements + * may be specified in deployment.xml, see com.yahoo.config.application.api.Bcp. + * + * This contains metrics averaged over the other clusters in the group this belongs to, + * which is used to amend scaling decisions in this cluster when it has little traffic on its own. + * + * @author bratseth + */ +public class BcpGroupInfo { + + private static final BcpGroupInfo empty = new BcpGroupInfo(0, 0, 0); + + private final double queryRate; + private final double growthRateHeadroom; + private final double cpuCostPerQuery; + + public BcpGroupInfo(double queryRate, double growthRateHeadroom, double cpuCostPerQuery) { + this.queryRate = queryRate; + this.growthRateHeadroom = growthRateHeadroom; + this.cpuCostPerQuery = cpuCostPerQuery; + } + + /** Returns the average query rate (queries/second) of the other clusters in the group this belongs to. */ + public double queryRate() { return queryRate; } + + /** Returns the average growth rate headroom of the other clusters in the group this belongs to. */ + public double growthRateHeadroom() { return growthRateHeadroom; } + + /** Returns the average total cluster CPU cost per query of the other clusters in the group this belongs to. */ + public double cpuCostPerQuery() { return cpuCostPerQuery; } + + public boolean isEmpty() { + return queryRate == 0 && growthRateHeadroom == 0 && cpuCostPerQuery == 0; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if ( ! 
(o instanceof BcpGroupInfo other)) return false; + if ( other.queryRate != this.queryRate) return false; + if ( other.growthRateHeadroom != this.growthRateHeadroom) return false; + if ( other.cpuCostPerQuery != this.cpuCostPerQuery) return false; + return true; + } + + @Override + public int hashCode() { + return Objects.hash(queryRate, growthRateHeadroom, cpuCostPerQuery); + } + + @Override + public String toString() { + return "BCP group info: " + queryRate + " q/s, " + growthRateHeadroom + " q/s headroom, " + + cpuCostPerQuery + " CPU cost per q/s"; + } + + public static BcpGroupInfo empty() { return empty; } + +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java index 8da6bd6937b..ea4944c2bd5 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java @@ -33,6 +33,7 @@ public class Cluster { private final boolean required; private final Autoscaling suggested; private final Autoscaling target; + private final BcpGroupInfo bcpGroupInfo; /** The maxScalingEvents last scaling events of this, sorted by increasing time (newest last) */ private final List scalingEvents; @@ -45,6 +46,7 @@ public class Cluster { boolean required, Autoscaling suggested, Autoscaling target, + BcpGroupInfo bcpGroupInfo, List scalingEvents) { this.id = Objects.requireNonNull(id); this.exclusive = exclusive; @@ -58,6 +60,7 @@ public class Cluster { this.target = Autoscaling.empty(); else this.target = target; + this.bcpGroupInfo = Objects.requireNonNull(bcpGroupInfo); this.scalingEvents = List.copyOf(scalingEvents); } @@ -77,7 +80,7 @@ public class Cluster { /** * Returns whether the resources of this cluster are required to be within the specified min and max. - * Otherwise they may be adjusted by capacity policies. 
+ * Otherwise, they may be adjusted by capacity policies. */ public boolean required() { return required; } @@ -102,6 +105,9 @@ public class Cluster { return true; } + /** Returns info about the BCP group of clusters this belongs to. */ + public BcpGroupInfo bcpGroupInfo() { return bcpGroupInfo; } + /** Returns the recent scaling events in this cluster */ public List scalingEvents() { return scalingEvents; } @@ -113,15 +119,19 @@ public class Cluster { public Cluster withConfiguration(boolean exclusive, Capacity capacity) { return new Cluster(id, exclusive, capacity.minResources(), capacity.maxResources(), capacity.groupSize(), capacity.isRequired(), - suggested, target, scalingEvents); + suggested, target, bcpGroupInfo, scalingEvents); } public Cluster withSuggested(Autoscaling suggested) { - return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, scalingEvents); + return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, bcpGroupInfo, scalingEvents); } public Cluster withTarget(Autoscaling target) { - return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, scalingEvents); + return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, bcpGroupInfo, scalingEvents); + } + + public Cluster with(BcpGroupInfo bcpGroupInfo) { + return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, bcpGroupInfo, scalingEvents); } /** Add or update (based on "at" time) a scaling event */ @@ -135,7 +145,7 @@ public class Cluster { scalingEvents.add(scalingEvent); prune(scalingEvents); - return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, scalingEvents); + return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, bcpGroupInfo, scalingEvents); } @Override @@ -167,7 +177,7 @@ public class Cluster { public static Cluster create(ClusterSpec.Id id, boolean exclusive, Capacity requested) { return new 
Cluster(id, exclusive, requested.minResources(), requested.maxResources(), requested.groupSize(), requested.isRequired(), - Autoscaling.empty(), Autoscaling.empty(), List.of()); + Autoscaling.empty(), Autoscaling.empty(), BcpGroupInfo.empty(), List.of()); } /** The predicted time it will take to rescale this cluster. */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index da74ad0b63b..4edcdbd3fa5 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -30,6 +30,9 @@ public class ClusterModel { /** Containers typically use more cpu right after generation change, so discard those metrics */ public static final Duration warmupDuration = Duration.ofMinutes(7); + /** If we have less than this query rate, we cannot be fully confident in our load data, which influences some decisions. */ + public static final double queryRateGivingFullConfidence = 100.0; + static final double idealQueryCpuLoad = 0.8; static final double idealWriteCpuLoad = 0.95; @@ -48,8 +51,13 @@ public class ClusterModel { private final Application application; private final ClusterSpec clusterSpec; private final Cluster cluster; - /** The current nodes of this cluster, or empty if this models a new cluster not yet deployed */ + + /** + * The current active nodes of this cluster, including retired, + * or empty if this models a new cluster not yet deployed. 
+ */ private final NodeList nodes; + private final Clock clock; private final Duration scalingDuration; private final ClusterTimeseries clusterTimeseries; @@ -118,6 +126,14 @@ public class ClusterModel { return adjustment; } + public OptionalDouble cpuCostPerQuery() { + if (averageQueryRate().isEmpty()) return OptionalDouble.empty(); + // TODO: Query rate should generally be sampled at the time where we see the peak resource usage + int fanOut = clusterSpec.type().isContainer() ? 1 : groupSize(); + return OptionalDouble.of(peakLoad().cpu() * queryCpuFraction() * fanOut * nodes.not().retired().first().get().resources().vcpu() + / averageQueryRate().getAsDouble() / groupCount()); + } + public boolean isStable(NodeRepository nodeRepository) { // An autoscaling decision was recently made if (hasScaledIn(Duration.ofMinutes(5))) @@ -143,59 +159,12 @@ public class ClusterModel { return true; } - private boolean hasScaledIn(Duration period) { - return cluster.lastScalingEvent().map(event -> event.at()).orElse(Instant.MIN) - .isAfter(clock.instant().minus(period)); - } - /** Returns the predicted duration of a rescaling of this cluster */ public Duration scalingDuration() { return scalingDuration; } - public ClusterNodesTimeseries nodeTimeseries() { return nodeTimeseries; } - - public ClusterTimeseries clusterTimeseries() { return clusterTimeseries; } - - /** - * Returns the predicted max query growth rate per minute as a fraction of the average traffic - * in the scaling window. - */ - public double maxQueryGrowthRate() { - if (maxQueryGrowthRate != null) return maxQueryGrowthRate; - return maxQueryGrowthRate = clusterTimeseries().maxQueryGrowthRate(scalingDuration(), clock); - } - - /** Returns the average query rate in the scaling window as a fraction of the max observed query rate. 
*/ - public double queryFractionOfMax() { - if (queryFractionOfMax != null) return queryFractionOfMax; - return queryFractionOfMax = clusterTimeseries().queryFractionOfMax(scalingDuration(), clock); - } - - /** Returns the average query rate in the scaling window. */ - public OptionalDouble averageQueryRate() { - if (averageQueryRate != null) return averageQueryRate; - return averageQueryRate = clusterTimeseries().queryRate(scalingDuration(), clock); - } - /** Returns the average of the peak load measurement in each dimension, from each node. */ - public Load peakLoad() { return nodeTimeseries().peakLoad(); } - - /** The number of nodes this cluster has, or will have if not deployed yet. */ - // TODO: Make this the deployed, not current count - public int nodeCount() { - if ( ! nodes.isEmpty()) return (int)nodes.stream().count(); - return cluster.minResources().nodes(); - } - - /** The number of groups this cluster has, or will have if not deployed yet. */ - // TODO: Make this the deployed, not current count - public int groupCount() { - if ( ! nodes.isEmpty()) return (int)nodes.stream().mapToInt(node -> node.allocation().get().membership().cluster().group().get().index()).distinct().count(); - return cluster.minResources().groups(); - } - - public int groupSize() { - // ceil: If the division does not produce a whole number we assume some node is missing - return (int)Math.ceil((double)nodeCount() / groupCount()); + public Load peakLoad() { + return nodeTimeseries().peakLoad(); } /** Returns the relative load adjustment accounting for redundancy in this. */ @@ -235,15 +204,88 @@ public class ClusterModel { * if one of the nodes go down. */ public Load idealLoad() { - return new Load(idealCpuLoad(), idealMemoryLoad(), idealDiskLoad()).divide(redundancyAdjustment()); + var ideal = new Load(idealCpuLoad(), idealMemoryLoad(), idealDiskLoad()).divide(redundancyAdjustment()); + if (! 
cluster.bcpGroupInfo().isEmpty()) { + // Do a weighted sum of the ideal "vote" based on local and bcp group info. + // This avoids any discontinuities with a near-zero local query rate. + double localInformationWeight = Math.min(1, averageQueryRate().orElse(0) / + Math.min(queryRateGivingFullConfidence, cluster.bcpGroupInfo().queryRate())); + Load bcpGroupIdeal = adjustQueryDependentIdealLoadByBcpGroupInfo(ideal); + ideal = ideal.multiply(localInformationWeight).add(bcpGroupIdeal.multiply(1 - localInformationWeight)); + } + return ideal; } - public int nodesAdjustedForRedundancy(int nodes, int groups) { + /** Returns the instant this model was created. */ + public Instant at() { return at;} + + private Load adjustQueryDependentIdealLoadByBcpGroupInfo(Load ideal) { + double currentClusterTotalVcpuPerGroup = nodes.not().retired().first().get().resources().vcpu() * groupSize(); + + double targetQueryRateToHandle = cluster.bcpGroupInfo().queryRate() * cluster.bcpGroupInfo().growthRateHeadroom() * trafficShiftHeadroom(); + double neededTotalVcpPerGroup = cluster.bcpGroupInfo().cpuCostPerQuery() * targetQueryRateToHandle / groupCount() + + ( 1 - queryCpuFraction()) * idealCpuLoad() * + (clusterSpec.type().isContainer() ? 1 : groupSize()); + + double cpuAdjustment = neededTotalVcpPerGroup / currentClusterTotalVcpuPerGroup; + return ideal.withCpu(peakLoad().cpu() / cpuAdjustment); + } + + private boolean hasScaledIn(Duration period) { + return cluster.lastScalingEvent().map(event -> event.at()).orElse(Instant.MIN) + .isAfter(clock.instant().minus(period)); + } + + private ClusterNodesTimeseries nodeTimeseries() { return nodeTimeseries; } + + private ClusterTimeseries clusterTimeseries() { return clusterTimeseries; } + + /** + * Returns the predicted max query growth rate per minute as a fraction of the average traffic + * in the scaling window. 
+ */ + private double maxQueryGrowthRate() { + if (maxQueryGrowthRate != null) return maxQueryGrowthRate; + return maxQueryGrowthRate = clusterTimeseries().maxQueryGrowthRate(scalingDuration(), clock); + } + + /** Returns the average query rate in the scaling window as a fraction of the max observed query rate. */ + private double queryFractionOfMax() { + if (queryFractionOfMax != null) return queryFractionOfMax; + return queryFractionOfMax = clusterTimeseries().queryFractionOfMax(scalingDuration(), clock); + } + + /** Returns the average query rate in the scaling window. */ + private OptionalDouble averageQueryRate() { + if (averageQueryRate != null) return averageQueryRate; + return averageQueryRate = clusterTimeseries().queryRate(scalingDuration(), clock); + } + + /** The number of nodes this cluster has, or will have if not deployed yet. */ + // TODO: Make this the deployed, not current count + private int nodeCount() { + if ( ! nodes.isEmpty()) return (int)nodes.stream().count(); + return cluster.minResources().nodes(); + } + + /** The number of groups this cluster has, or will have if not deployed yet. */ + // TODO: Make this the deployed, not current count + private int groupCount() { + if ( ! nodes.isEmpty()) return (int)nodes.stream().mapToInt(node -> node.allocation().get().membership().cluster().group().get().index()).distinct().count(); + return cluster.minResources().groups(); + } + + private int groupSize() { + // ceil: If the division does not produce a whole number we assume some node is missing + return (int)Math.ceil((double)nodeCount() / groupCount()); + } + + private int nodesAdjustedForRedundancy(int nodes, int groups) { int groupSize = (int)Math.ceil((double)nodes / groups); return nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes; } - public int groupsAdjustedForRedundancy(int nodes, int groups) { + private int groupsAdjustedForRedundancy(int nodes, int groups) { return nodes > 1 ? (groups == 1 ? 
1 : groups - 1) : groups; } @@ -258,9 +300,6 @@ public class ClusterModel { (1 - queryCpuFraction) * idealWriteCpuLoad; } - /** Returns the instant this model was created. */ - public Instant at() { return at;} - /** Returns the headroom for growth during organic traffic growth as a multiple of current resources. */ private double growthRateHeadroom() { if ( ! zone.environment().isProduction()) return 1; @@ -280,7 +319,7 @@ public class ClusterModel { if ( ! zone.environment().isProduction()) return 1; double trafficShiftHeadroom; if (application.status().maxReadShare() == 0) // No traffic fraction data - trafficShiftHeadroom = 2.0; // assume we currently get half of the global share of traffic + trafficShiftHeadroom = 2.0; // assume we currently get half of the max possible share of traffic else if (application.status().currentReadShare() == 0) trafficShiftHeadroom = 1/application.status().maxReadShare(); else @@ -294,11 +333,11 @@ public class ClusterModel { * with high confidence to avoid large adjustments caused by random noise due to low traffic numbers. */ private double adjustByConfidence(double headroom) { - return ( (headroom -1 ) * Math.min(1, averageQueryRate().orElse(0) / 100.0) ) + 1; + return ( (headroom -1 ) * Math.min(1, averageQueryRate().orElse(0) / queryRateGivingFullConfidence) ) + 1; } /** The estimated fraction of cpu usage which goes to processing queries vs. 
writes */ - public double queryCpuFraction() { + private double queryCpuFraction() { OptionalDouble writeRate = clusterTimeseries().writeRate(scalingDuration(), clock); if (averageQueryRate().orElse(0) == 0 && writeRate.orElse(0) == 0) return queryCpuFraction(0.5); return queryCpuFraction(averageQueryRate().orElse(0) / (averageQueryRate().orElse(0) + writeRate.orElse(0))); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java index 6ab5ff731d3..a2fa6e63922 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java @@ -30,6 +30,10 @@ public class Load { public double memory() { return memory; } public double disk() { return disk; } + public Load withCpu(double cpu) { return new Load(cpu, memory, disk); } + public Load withMemory(double memory) { return new Load(cpu, memory, disk); } + public Load withDisk(double disk) { return new Load(cpu, memory, disk); } + public Load add(Load other) { return join(other, (a, b) -> a + b); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java index 469cabc4ee4..1b73dee8b6c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java @@ -12,6 +12,7 @@ import com.yahoo.slime.Slime; import com.yahoo.slime.SlimeUtils; import com.yahoo.vespa.hosted.provision.applications.Application; import com.yahoo.vespa.hosted.provision.applications.Cluster; +import com.yahoo.vespa.hosted.provision.applications.BcpGroupInfo; import 
com.yahoo.vespa.hosted.provision.applications.ScalingEvent; import com.yahoo.vespa.hosted.provision.applications.Status; import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling; @@ -53,6 +54,10 @@ public class ApplicationSerializer { private static final String groupSizeKey = "groupSize"; private static final String requiredKey = "required"; private static final String suggestedKey = "suggested"; + private static final String bcpGroupInfoKey = "bcpGroupInfo"; + private static final String queryRateKey = "queryRateKey"; + private static final String growthRateHeadroomKey = "growthRateHeadroomKey"; + private static final String cpuCostPerQueryKey = "cpuCostPerQueryKey"; private static final String resourcesKey = "resources"; private static final String targetKey = "target"; private static final String nodesKey = "nodes"; @@ -129,6 +134,8 @@ public class ApplicationSerializer { clusterObject.setBool(requiredKey, cluster.required()); toSlime(cluster.suggested(), clusterObject.setObject(suggestedKey)); toSlime(cluster.target(), clusterObject.setObject(targetKey)); + if (! 
cluster.bcpGroupInfo().isEmpty()) + toSlime(cluster.bcpGroupInfo(), clusterObject.setObject(bcpGroupInfoKey)); scalingEventsToSlime(cluster.scalingEvents(), clusterObject.setArray(scalingEventsKey)); } @@ -141,6 +148,7 @@ public class ApplicationSerializer { clusterObject.field(requiredKey).asBool(), autoscalingFromSlime(clusterObject.field(suggestedKey), clusterObject.field("nonExisting")), autoscalingFromSlime(clusterObject.field(targetKey), clusterObject.field(autoscalingStatusObjectKey)), + bcpGroupInfoFromSlime(clusterObject.field(bcpGroupInfoKey)), scalingEventsFromSlime(clusterObject.field(scalingEventsKey))); } @@ -222,6 +230,20 @@ public class ApplicationSerializer { loadFromSlime(autoscalingObject.field(idealKey))); } + private static void toSlime(BcpGroupInfo bcpGroupInfo, Cursor bcpGroupInfoObject) { + if (bcpGroupInfo.isEmpty()) return; + bcpGroupInfoObject.setDouble(queryRateKey, bcpGroupInfo.queryRate()); + bcpGroupInfoObject.setDouble(growthRateHeadroomKey, bcpGroupInfo.growthRateHeadroom()); + bcpGroupInfoObject.setDouble(cpuCostPerQueryKey, bcpGroupInfo.cpuCostPerQuery()); + } + + private static BcpGroupInfo bcpGroupInfoFromSlime(Inspector bcpGroupInfoObject) { + if ( ! 
bcpGroupInfoObject.valid()) return BcpGroupInfo.empty(); + return new BcpGroupInfo(bcpGroupInfoObject.field(queryRateKey).asDouble(), + bcpGroupInfoObject.field(growthRateHeadroomKey).asDouble(), + bcpGroupInfoObject.field(cpuCostPerQueryKey).asDouble()); + } + private static void scalingEventsToSlime(List scalingEvents, Cursor eventArray) { scalingEvents.forEach(event -> toSlime(event, eventArray.addObject())); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java index 05d0822758d..19c6ce16674 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java @@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.config.provision.ApplicationId; import com.yahoo.config.provision.Capacity; -import com.yahoo.config.provision.Cloud; import com.yahoo.config.provision.ClusterResources; import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.Flavor; @@ -135,6 +134,7 @@ class AutoscalingTester { cluster.required(), cluster.suggested(), cluster.target(), + cluster.bcpGroupInfo(), List.of()); // Remove scaling events cluster = cluster.with(ScalingEvent.create(cluster.minResources(), cluster.minResources(), 0, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java new file mode 100644 index 00000000000..0bd94872557 --- /dev/null +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java @@ -0,0 +1,238 @@ +package com.yahoo.vespa.hosted.provision.autoscale; + +import 
com.yahoo.config.provision.Capacity; +import com.yahoo.config.provision.ClusterResources; +import com.yahoo.config.provision.ClusterSpec; +import com.yahoo.config.provision.NodeResources; +import com.yahoo.vespa.hosted.provision.applications.BcpGroupInfo; +import org.junit.Test; + +import java.time.Duration; +import java.util.Optional; + +/** + * Tests autoscaling using information from the BCP group this cluster deployment + * is part of to supplement local data when the local deployment lacks sufficient traffic. + * + * @author bratseth + */ +public class AutoscalingUsingBcpGroupInfoTest { + + /** Tests with varying BCP group info parameters. */ + @Test + public void test_autoscaling_single_content_group() { + var fixture = AutoscalingTester.fixture().awsProdSetup(true).build(); + + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(100, 1.1, 0.3)); + fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 8, 1, 4.0, 7.6, 37.8, + fixture.autoscale()); + + // Higher query rate (mem and disk changes are due to being assigned larger hosts where we get less overhead share) + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(200, 1.1, 0.3)); + fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 8, 1, 8.0, 7.4, 32.8, + fixture.autoscale()); + + // Higher headroom + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(100, 1.3, 0.3)); + fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 9, 1, 4.2, 6.6, 33.1, + fixture.autoscale()); + + // Higher per query cost + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(100, 1.1, 0.45)); + fixture.loader().addCpuMeasurements(0.7f, 10); + 
fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 9, 1, 5.4, 6.6, 33.1, + fixture.autoscale()); + } + + /** Tests with varying BCP group info parameters. */ + @Test + public void test_autoscaling_multiple_content_groups() { + var min = new ClusterResources(3, 3, + new NodeResources(1, 4, 10, 1, NodeResources.DiskSpeed.any)); + var max = new ClusterResources(21, 3, + new NodeResources(100, 1000, 1000, 1, NodeResources.DiskSpeed.any)); + var fixture = AutoscalingTester.fixture() + .awsProdSetup(true) + .initialResources(Optional.of(new ClusterResources(9, 3, new NodeResources(2, 16, 75, 1)))) + .capacity(Capacity.from(min, max)) + .build(); + + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(100, 1.1, 0.3)); + fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 3, 3, 10.5, 42.3, 187.0, + fixture.autoscale()); + + // Higher query rate (mem and disk changes are due to being assigned larger hosts where we get less overhead share) + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(200, 1.1, 0.3)); + fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 3, 3, 20.9, 42.3, 178.0, + fixture.autoscale()); + + // Higher headroom + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(100, 1.3, 0.3)); + fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 3, 3, 12.4, 42.3, 187.0, + fixture.autoscale()); + + // Higher per query cost + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(100, 1.1, 0.45)); + fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 3, 3, 15.7, 42.3, 187.0, + fixture.autoscale()); + } + + /** + 
* Tests with varying BCP group info parameters for containers. + * Differences from content: + * - No host sharing. + * - Memory and disk are independent of cluster size. + */ + @Test + public void test_autoscaling_container() { + var fixture = AutoscalingTester.fixture().clusterType(ClusterSpec.Type.container).awsProdSetup(true).build(); + + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(100, 1.1, 0.3)); + fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 8, 1, 4.0, 16.0, 40.8, + fixture.autoscale()); + + // Higher query rate (unlike for content clusters, mem and disk are unchanged here as they are independent of cluster size) + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(200, 1.1, 0.3)); + fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 8, 1, 8.0, 16.0, 40.8, + fixture.autoscale()); + + // Higher headroom + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(100, 1.3, 0.3)); + fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 5, 1, 8.0, 16.0, 40.8, + fixture.autoscale()); + + // Higher per query cost + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(100, 1.1, 0.45)); + fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 6, 1, 8.0, 16.0, 40.8, + fixture.autoscale()); + } + + @Test + public void test_autoscaling_single_content_group_with_some_local_traffic() { + var fixture = AutoscalingTester.fixture().awsProdSetup(true).build(); + + // Baseline: No local traffic, group traffic indicates much higher cpu usage than local + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(200, 1.3, 
0.45)); + fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 8, 1, 14.2, 7.4, 32.8, + fixture.autoscale()); + + // Some local traffic + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(200, 1.3, 0.45)); + Duration duration1 = fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().clock().advance(duration1.negated()); + fixture.loader().addQueryRateMeasurements(10, __ -> 10.0); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 8, 1, 6.9, 7.6, 37.8, + fixture.autoscale()); + + // Enough local traffic to get half the votes + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(200, 1.3, 0.45)); + Duration duration2 = fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().clock().advance(duration2.negated()); + fixture.loader().addQueryRateMeasurements(10, __ -> 50.0); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 7, 1, 3.5, 8.9, 55.5, + fixture.autoscale()); + + // Mostly local + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(200, 1.3, 0.45)); + Duration duration3 = fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().clock().advance(duration3.negated()); + fixture.loader().addQueryRateMeasurements(10, __ -> 90.0); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 7, 1, 2.7, 8.9, 55.5, + fixture.autoscale()); + + // Local only + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(200, 1.3, 0.45)); + Duration duration4 = fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().clock().advance(duration4.negated()); + fixture.loader().addQueryRateMeasurements(10, __ -> 100.0); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 7, 1, 2.6, 8.9, 55.5, + fixture.autoscale()); + + // No group 
info, should be the same as the above + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(BcpGroupInfo.empty()); + Duration duration5 = fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().clock().advance(duration5.negated()); + fixture.loader().addQueryRateMeasurements(10, __ -> 100.0); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 7, 1, 2.6, 8.9, 55.5, + fixture.autoscale()); + + // 40 query rate, no group info (for reference to the below) + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(BcpGroupInfo.empty()); + Duration duration6 = fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().clock().advance(duration6.negated()); + fixture.loader().addQueryRateMeasurements(10, __ -> 40.0); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 6, 1, 2.2, 10.6, 66.5, + fixture.autoscale()); + + // Local query rate is too low but global is even lower so disregard it, giving the same as above + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(200/40.0, 1.3, 0.45*40.0)); + Duration duration7 = fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().clock().advance(duration7.negated()); + fixture.loader().addQueryRateMeasurements(10, __ -> 40.0); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 6, 1, 2.2, 10.6, 66.5, + fixture.autoscale()); + + // Local query rate is too low to be fully confident, and so is global but as it is slightly larger, incorporate it slightly + fixture.tester().clock().advance(Duration.ofDays(2)); + fixture.store(new BcpGroupInfo(200/4.0, 1.3, 0.45*4.0)); + Duration duration8 = fixture.loader().addCpuMeasurements(0.7f, 10); + fixture.tester().clock().advance(duration8.negated()); + fixture.loader().addQueryRateMeasurements(10, __ -> 40.0); + fixture.tester().assertResources("Scaling up cpu using bcp group cpu info", + 7, 1, 2.2, 8.9, 55.5, + 
fixture.autoscale()); + } + +} diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java index 1e1e00a10db..5caf50a4e83 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java @@ -21,6 +21,7 @@ import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.applications.Application; import com.yahoo.vespa.hosted.provision.applications.Cluster; +import com.yahoo.vespa.hosted.provision.applications.BcpGroupInfo; import com.yahoo.vespa.hosted.provision.autoscale.awsnodes.AwsHostResourcesCalculatorImpl; import com.yahoo.vespa.hosted.provision.autoscale.awsnodes.AwsNodeTypes; import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator; @@ -132,6 +133,12 @@ public class Fixture { tester.nodeRepository().applications().put(application, tester.nodeRepository().applications().lock(applicationId)); } + /** Test helper: replaces the BCP group info on this fixture's cluster (looked up by clusterId()) with the given value and puts the updated application into the node repository. NOTE(review): the lock obtained from lock(applicationId) is passed straight to put() and is not explicitly released here, same as in the method just above; confirm that is acceptable for test code. */ public void store(BcpGroupInfo bcpGroupInfo) { + var application = application(); + application = application.with(application.cluster(clusterId()).get().with(bcpGroupInfo)); + tester.nodeRepository().applications().put(application, tester.nodeRepository().applications().lock(applicationId)); + } + public static class Builder { ApplicationId application = AutoscalingTester.applicationId("application1"); diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java index 10c8c7434b1..a104f0b1bc8 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java @@ -79,10 +79,10 @@ public
class Loader { return Duration.between(initialTime, fixture.tester().clock().instant()); } - public void applyCpuLoad(double cpuLoad, int measurements) { - addCpuMeasurements((float)cpuLoad, measurements); - fixture.tester().clock().advance(samplingInterval.negated().multipliedBy(measurements)); - addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only + /** Adds the given number of CPU measurements at cpuLoad, rewinds the test clock by the Duration that addCpuMeasurements returned (presumably so both series cover the same time window; confirm), then adds the same number of query rate measurements: 200.0 where the lambda argument is 0, otherwise 100.0. Returns the Duration returned by addQueryRateMeasurements. */ public Duration applyCpuLoad(double cpuLoad, int measurements) { + Duration duration = addCpuMeasurements((float)cpuLoad, measurements); + fixture.tester().clock().advance(duration.negated()); + return addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only } public void applyMemLoad(double memLoad, int measurements) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java index bce10b999bb..c8dc0d97320 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java @@ -8,6 +8,7 @@ import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeResources; import com.yahoo.vespa.hosted.provision.applications.Application; import com.yahoo.vespa.hosted.provision.applications.Cluster; +import com.yahoo.vespa.hosted.provision.applications.BcpGroupInfo; import com.yahoo.vespa.hosted.provision.applications.ScalingEvent; import com.yahoo.vespa.hosted.provision.applications.Status; import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling; @@ -39,6 +40,7 @@ public class ApplicationSerializerTest { true, Autoscaling.empty(), Autoscaling.empty(), + BcpGroupInfo.empty(), List.of())); var minResources = new NodeResources(1, 2, 3, 4); clusters.add(new Cluster(ClusterSpec.Id.from("c2"), @@ -61,6 +63,7 @@
public class ApplicationSerializerTest { Instant.ofEpochMilli(5678L), Load.zero(), Load.one()), + new BcpGroupInfo(0.1, 0.2, 0.3), List.of(new ScalingEvent(new ClusterResources(10, 5, minResources), new ClusterResources(12, 6, minResources), 7L, @@ -90,6 +93,7 @@ public class ApplicationSerializerTest { assertEquals(originalCluster.required(), serializedCluster.required()); assertEquals(originalCluster.suggested(), serializedCluster.suggested()); assertEquals(originalCluster.target(), serializedCluster.target()); + assertEquals(originalCluster.bcpGroupInfo(), serializedCluster.bcpGroupInfo()); assertEquals(originalCluster.scalingEvents(), serializedCluster.scalingEvents()); } } -- cgit v1.2.3