aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2023-02-13 12:23:55 +0100
committerJon Bratseth <bratseth@gmail.com>2023-02-13 12:23:55 +0100
commit0ba8b0001cee9ae1aad8fbdfac863a79da212d1c (patch)
tree2903f5356a7efa3646b1bce6c16e000286335f06 /node-repository
parentfa526bcc311ae6080905b61fb9248aca82aa4991 (diff)
Cold autoscaling WIP
Core support for autoscaling using data from cluster deployments in other regions when there is little or no traffic in our own deployment.
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/BcpGroupInfo.java66
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java22
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java157
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java22
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java238
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java7
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java8
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java4
10 files changed, 460 insertions, 70 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/BcpGroupInfo.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/BcpGroupInfo.java
new file mode 100644
index 00000000000..6b0ea8532be
--- /dev/null
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/BcpGroupInfo.java
@@ -0,0 +1,66 @@
+package com.yahoo.vespa.hosted.provision.applications;
+
+import java.util.Objects;
+
+/**
+ * When there are multiple deployments of an application in different regions,
+ * instances of the cluster across regions may form a "BCP group".
+ * By default the clusters in all production regions form such a group, but other arrangements
+ * may be specified in deployment.xml, see com.yahoo.config.application.api.Bcp.
+ *
+ * This contains metrics averaged over the other clusters in the group this belongs to,
+ * which is used to amend scaling decisions in this cluster when it has little traffic on its own.
+ *
+ * Instances are immutable value objects.
+ *
+ * @author bratseth
+ */
+public class BcpGroupInfo {
+
+    private static final BcpGroupInfo empty = new BcpGroupInfo(0, 0, 0);
+
+    private final double queryRate;
+    private final double growthRateHeadroom;
+    private final double cpuCostPerQuery;
+
+    /**
+     * Creates a BCP group info instance.
+     *
+     * @param queryRate the average query rate (queries/second) of the other clusters in the group
+     * @param growthRateHeadroom the average growth rate headroom of the other clusters in the group
+     * @param cpuCostPerQuery the average total cluster CPU cost per query of the other clusters in the group
+     */
+    public BcpGroupInfo(double queryRate, double growthRateHeadroom, double cpuCostPerQuery) {
+        this.queryRate = queryRate;
+        this.growthRateHeadroom = growthRateHeadroom;
+        this.cpuCostPerQuery = cpuCostPerQuery;
+    }
+
+    /** Returns the average query rate (queries/second) of the other clusters in the group this belongs to. */
+    public double queryRate() { return queryRate; }
+
+    /** Returns the average growth rate headroom of the other clusters in the group this belongs to. */
+    public double growthRateHeadroom() { return growthRateHeadroom; }
+
+    /** Returns the average total cluster CPU cost per query of the other clusters in the group this belongs to. */
+    public double cpuCostPerQuery() { return cpuCostPerQuery; }
+
+    /** Returns whether this carries no information, i.e. all three values are zero. */
+    public boolean isEmpty() {
+        return queryRate == 0 && growthRateHeadroom == 0 && cpuCostPerQuery == 0;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if ( ! (o instanceof BcpGroupInfo other)) return false;
+        // Double.compare agrees with the boxed hashing used in hashCode():
+        // NaN equals NaN and 0.0 differs from -0.0, so equal instances always have equal hash codes.
+        return Double.compare(other.queryRate, this.queryRate) == 0 &&
+               Double.compare(other.growthRateHeadroom, this.growthRateHeadroom) == 0 &&
+               Double.compare(other.cpuCostPerQuery, this.cpuCostPerQuery) == 0;
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(queryRate, growthRateHeadroom, cpuCostPerQuery);
+    }
+
+    @Override
+    public String toString() {
+        // The headroom is a unitless multiplier and the cost is per query, not per q/s
+        return "BCP group info: " + queryRate + " q/s, " + growthRateHeadroom + " growth rate headroom, " +
+               cpuCostPerQuery + " CPU cost per query";
+    }
+
+    /** Returns the shared instance carrying no information (all values zero). */
+    public static BcpGroupInfo empty() { return empty; }
+
+}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
index 8da6bd6937b..ea4944c2bd5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
@@ -33,6 +33,7 @@ public class Cluster {
private final boolean required;
private final Autoscaling suggested;
private final Autoscaling target;
+ private final BcpGroupInfo bcpGroupInfo;
/** The maxScalingEvents last scaling events of this, sorted by increasing time (newest last) */
private final List<ScalingEvent> scalingEvents;
@@ -45,6 +46,7 @@ public class Cluster {
boolean required,
Autoscaling suggested,
Autoscaling target,
+ BcpGroupInfo bcpGroupInfo,
List<ScalingEvent> scalingEvents) {
this.id = Objects.requireNonNull(id);
this.exclusive = exclusive;
@@ -58,6 +60,7 @@ public class Cluster {
this.target = Autoscaling.empty();
else
this.target = target;
+ this.bcpGroupInfo = Objects.requireNonNull(bcpGroupInfo);
this.scalingEvents = List.copyOf(scalingEvents);
}
@@ -77,7 +80,7 @@ public class Cluster {
/**
* Returns whether the resources of this cluster are required to be within the specified min and max.
- * Otherwise they may be adjusted by capacity policies.
+ * Otherwise, they may be adjusted by capacity policies.
*/
public boolean required() { return required; }
@@ -102,6 +105,9 @@ public class Cluster {
return true;
}
+ /** Returns info about the BCP group of clusters this belongs to. */
+ public BcpGroupInfo bcpGroupInfo() { return bcpGroupInfo; }
+
/** Returns the recent scaling events in this cluster */
public List<ScalingEvent> scalingEvents() { return scalingEvents; }
@@ -113,15 +119,19 @@ public class Cluster {
public Cluster withConfiguration(boolean exclusive, Capacity capacity) {
return new Cluster(id, exclusive,
capacity.minResources(), capacity.maxResources(), capacity.groupSize(), capacity.isRequired(),
- suggested, target, scalingEvents);
+ suggested, target, bcpGroupInfo, scalingEvents);
}
public Cluster withSuggested(Autoscaling suggested) {
- return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, scalingEvents);
+ return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, bcpGroupInfo, scalingEvents);
}
public Cluster withTarget(Autoscaling target) {
- return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, scalingEvents);
+ return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, bcpGroupInfo, scalingEvents);
+ }
+
+ public Cluster with(BcpGroupInfo bcpGroupInfo) {
+ return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, bcpGroupInfo, scalingEvents);
}
/** Add or update (based on "at" time) a scaling event */
@@ -135,7 +145,7 @@ public class Cluster {
scalingEvents.add(scalingEvent);
prune(scalingEvents);
- return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, scalingEvents);
+ return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, bcpGroupInfo, scalingEvents);
}
@Override
@@ -167,7 +177,7 @@ public class Cluster {
public static Cluster create(ClusterSpec.Id id, boolean exclusive, Capacity requested) {
return new Cluster(id, exclusive,
requested.minResources(), requested.maxResources(), requested.groupSize(), requested.isRequired(),
- Autoscaling.empty(), Autoscaling.empty(), List.of());
+ Autoscaling.empty(), Autoscaling.empty(), BcpGroupInfo.empty(), List.of());
}
/** The predicted time it will take to rescale this cluster. */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
index da74ad0b63b..4edcdbd3fa5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -30,6 +30,9 @@ public class ClusterModel {
/** Containers typically use more cpu right after generation change, so discard those metrics */
public static final Duration warmupDuration = Duration.ofMinutes(7);
+ /** If we have less than this query rate, we cannot be fully confident in our load data, which influences some decisions. */
+ public static final double queryRateGivingFullConfidence = 100.0;
+
static final double idealQueryCpuLoad = 0.8;
static final double idealWriteCpuLoad = 0.95;
@@ -48,8 +51,13 @@ public class ClusterModel {
private final Application application;
private final ClusterSpec clusterSpec;
private final Cluster cluster;
- /** The current nodes of this cluster, or empty if this models a new cluster not yet deployed */
+
+ /**
+ * The current active nodes of this cluster, including retired,
+ * or empty if this models a new cluster not yet deployed.
+ */
private final NodeList nodes;
+
private final Clock clock;
private final Duration scalingDuration;
private final ClusterTimeseries clusterTimeseries;
@@ -118,6 +126,14 @@ public class ClusterModel {
return adjustment;
}
+ /**
+  * Returns the estimated cpu cost per query in this cluster: The peak cpu load attributed to queries,
+  * multiplied by the query fan-out and the vcpu of a non-retired node, divided by the average
+  * query rate and the number of groups.
+  *
+  * @return the cost, or empty if there is no (nonzero) average query rate to divide by,
+  *         or no non-retired node to read the vcpu from
+  */
+ public OptionalDouble cpuCostPerQuery() {
+     OptionalDouble queryRate = averageQueryRate();
+     // A zero rate would make the division below produce infinity or NaN rather than a cost
+     if (queryRate.isEmpty() || queryRate.getAsDouble() == 0) return OptionalDouble.empty();
+     // nodes is empty when this models a cluster which is not deployed yet
+     var nonRetiredNode = nodes.not().retired().first();
+     if (nonRetiredNode.isEmpty()) return OptionalDouble.empty();
+     // TODO: Query rate should generally be sampled at the time where we see the peak resource usage
+     int fanOut = clusterSpec.type().isContainer() ? 1 : groupSize();
+     return OptionalDouble.of(peakLoad().cpu() * queryCpuFraction() * fanOut * nonRetiredNode.get().resources().vcpu()
+                              / queryRate.getAsDouble() / groupCount());
+ }
+
public boolean isStable(NodeRepository nodeRepository) {
// An autoscaling decision was recently made
if (hasScaledIn(Duration.ofMinutes(5)))
@@ -143,59 +159,12 @@ public class ClusterModel {
return true;
}
- private boolean hasScaledIn(Duration period) {
- return cluster.lastScalingEvent().map(event -> event.at()).orElse(Instant.MIN)
- .isAfter(clock.instant().minus(period));
- }
-
/** Returns the predicted duration of a rescaling of this cluster */
public Duration scalingDuration() { return scalingDuration; }
- public ClusterNodesTimeseries nodeTimeseries() { return nodeTimeseries; }
-
- public ClusterTimeseries clusterTimeseries() { return clusterTimeseries; }
-
- /**
- * Returns the predicted max query growth rate per minute as a fraction of the average traffic
- * in the scaling window.
- */
- public double maxQueryGrowthRate() {
- if (maxQueryGrowthRate != null) return maxQueryGrowthRate;
- return maxQueryGrowthRate = clusterTimeseries().maxQueryGrowthRate(scalingDuration(), clock);
- }
-
- /** Returns the average query rate in the scaling window as a fraction of the max observed query rate. */
- public double queryFractionOfMax() {
- if (queryFractionOfMax != null) return queryFractionOfMax;
- return queryFractionOfMax = clusterTimeseries().queryFractionOfMax(scalingDuration(), clock);
- }
-
- /** Returns the average query rate in the scaling window. */
- public OptionalDouble averageQueryRate() {
- if (averageQueryRate != null) return averageQueryRate;
- return averageQueryRate = clusterTimeseries().queryRate(scalingDuration(), clock);
- }
-
/** Returns the average of the peak load measurement in each dimension, from each node. */
- public Load peakLoad() { return nodeTimeseries().peakLoad(); }
-
- /** The number of nodes this cluster has, or will have if not deployed yet. */
- // TODO: Make this the deployed, not current count
- public int nodeCount() {
- if ( ! nodes.isEmpty()) return (int)nodes.stream().count();
- return cluster.minResources().nodes();
- }
-
- /** The number of groups this cluster has, or will have if not deployed yet. */
- // TODO: Make this the deployed, not current count
- public int groupCount() {
- if ( ! nodes.isEmpty()) return (int)nodes.stream().mapToInt(node -> node.allocation().get().membership().cluster().group().get().index()).distinct().count();
- return cluster.minResources().groups();
- }
-
- public int groupSize() {
- // ceil: If the division does not produce a whole number we assume some node is missing
- return (int)Math.ceil((double)nodeCount() / groupCount());
+ public Load peakLoad() {
+ return nodeTimeseries().peakLoad();
}
/** Returns the relative load adjustment accounting for redundancy in this. */
@@ -235,15 +204,88 @@ public class ClusterModel {
* if one of the nodes go down.
*/
public Load idealLoad() {
- return new Load(idealCpuLoad(), idealMemoryLoad(), idealDiskLoad()).divide(redundancyAdjustment());
+ var ideal = new Load(idealCpuLoad(), idealMemoryLoad(), idealDiskLoad()).divide(redundancyAdjustment());
+ if (cluster.bcpGroupInfo().isEmpty()) return ideal;
+
+ // The local query rate gives full weight to local information once it reaches this rate
+ double fullConfidenceQueryRate = Math.min(queryRateGivingFullConfidence, cluster.bcpGroupInfo().queryRate());
+ // Without any query rate in the group there is nothing to weigh the local information against,
+ // and dividing by zero below would turn the weight (and thereby the whole ideal load) into NaN
+ if (fullConfidenceQueryRate <= 0) return ideal;
+
+ // Do a weighted sum of the ideal "vote" based on local and bcp group info.
+ // This avoids any discontinuities with a near-zero local query rate.
+ double localInformationWeight = Math.min(1, averageQueryRate().orElse(0) / fullConfidenceQueryRate);
+ Load bcpGroupIdeal = adjustQueryDependentIdealLoadByBcpGroupInfo(ideal);
+ return ideal.multiply(localInformationWeight).add(bcpGroupIdeal.multiply(1 - localInformationWeight));
}
- public int nodesAdjustedForRedundancy(int nodes, int groups) {
+ /** Returns the instant this model was created. */
+ public Instant at() { return at;}
+
+ /**
+  * Returns the given ideal load with its cpu value replaced by one derived from the BCP group info
+  * of this cluster: The peak cpu load divided by the ratio between the vcpu needed per group
+  * and the vcpu currently available per group. Memory and disk are returned unchanged.
+  *
+  * NOTE(review): first().get() throws if there are no non-retired nodes - confirm callers guarantee one.
+  */
+ private Load adjustQueryDependentIdealLoadByBcpGroupInfo(Load ideal) {
+ // The vcpu of one non-retired node times the number of nodes per group
+ double currentClusterTotalVcpuPerGroup = nodes.not().retired().first().get().resources().vcpu() * groupSize();
+
+ // The group query rate scaled up by the group's growth headroom and our own traffic shift headroom
+ double targetQueryRateToHandle = cluster.bcpGroupInfo().queryRate() * cluster.bcpGroupInfo().growthRateHeadroom() * trafficShiftHeadroom();
+ // Query part: cost per query times the target rate, spread over the groups.
+ // Second term: the non-query cpu fraction at ideal load.
+ // NOTE(review): the second term has no vcpu factor, unlike the first - confirm the units are intended.
+ double neededTotalVcpPerGroup = cluster.bcpGroupInfo().cpuCostPerQuery() * targetQueryRateToHandle / groupCount() +
+ ( 1 - queryCpuFraction()) * idealCpuLoad() *
+ (clusterSpec.type().isContainer() ? 1 : groupSize());
+
+ // Above 1 when more vcpu is needed than what is currently available per group
+ double cpuAdjustment = neededTotalVcpPerGroup / currentClusterTotalVcpuPerGroup;
+ return ideal.withCpu(peakLoad().cpu() / cpuAdjustment);
+ }
+
+ private boolean hasScaledIn(Duration period) {
+ return cluster.lastScalingEvent().map(event -> event.at()).orElse(Instant.MIN)
+ .isAfter(clock.instant().minus(period));
+ }
+
+ private ClusterNodesTimeseries nodeTimeseries() { return nodeTimeseries; }
+
+ private ClusterTimeseries clusterTimeseries() { return clusterTimeseries; }
+
+ /**
+ * Returns the predicted max query growth rate per minute as a fraction of the average traffic
+ * in the scaling window.
+ */
+ private double maxQueryGrowthRate() {
+ if (maxQueryGrowthRate != null) return maxQueryGrowthRate;
+ return maxQueryGrowthRate = clusterTimeseries().maxQueryGrowthRate(scalingDuration(), clock);
+ }
+
+ /** Returns the average query rate in the scaling window as a fraction of the max observed query rate. */
+ private double queryFractionOfMax() {
+ if (queryFractionOfMax != null) return queryFractionOfMax;
+ return queryFractionOfMax = clusterTimeseries().queryFractionOfMax(scalingDuration(), clock);
+ }
+
+ /** Returns the average query rate in the scaling window. */
+ private OptionalDouble averageQueryRate() {
+ if (averageQueryRate != null) return averageQueryRate;
+ return averageQueryRate = clusterTimeseries().queryRate(scalingDuration(), clock);
+ }
+
+ /** The number of nodes this cluster has, or will have if not deployed yet. */
+ // TODO: Make this the deployed, not current count
+ private int nodeCount() {
+ if ( ! nodes.isEmpty()) return (int)nodes.stream().count();
+ return cluster.minResources().nodes();
+ }
+
+ /** The number of groups this cluster has, or will have if not deployed yet. */
+ // TODO: Make this the deployed, not current count
+ private int groupCount() {
+ if ( ! nodes.isEmpty()) return (int)nodes.stream().mapToInt(node -> node.allocation().get().membership().cluster().group().get().index()).distinct().count();
+ return cluster.minResources().groups();
+ }
+
+ private int groupSize() {
+ // ceil: If the division does not produce a whole number we assume some node is missing
+ return (int)Math.ceil((double)nodeCount() / groupCount());
+ }
+
+ private int nodesAdjustedForRedundancy(int nodes, int groups) {
int groupSize = (int)Math.ceil((double)nodes / groups);
return nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes;
}
- public int groupsAdjustedForRedundancy(int nodes, int groups) {
+ private int groupsAdjustedForRedundancy(int nodes, int groups) {
return nodes > 1 ? (groups == 1 ? 1 : groups - 1) : groups;
}
@@ -258,9 +300,6 @@ public class ClusterModel {
(1 - queryCpuFraction) * idealWriteCpuLoad;
}
- /** Returns the instant this model was created. */
- public Instant at() { return at;}
-
/** Returns the headroom for growth during organic traffic growth as a multiple of current resources. */
private double growthRateHeadroom() {
if ( ! zone.environment().isProduction()) return 1;
@@ -280,7 +319,7 @@ public class ClusterModel {
if ( ! zone.environment().isProduction()) return 1;
double trafficShiftHeadroom;
if (application.status().maxReadShare() == 0) // No traffic fraction data
- trafficShiftHeadroom = 2.0; // assume we currently get half of the global share of traffic
+ trafficShiftHeadroom = 2.0; // assume we currently get half of the max possible share of traffic
else if (application.status().currentReadShare() == 0)
trafficShiftHeadroom = 1/application.status().maxReadShare();
else
@@ -294,11 +333,11 @@ public class ClusterModel {
* with high confidence to avoid large adjustments caused by random noise due to low traffic numbers.
*/
private double adjustByConfidence(double headroom) {
- return ( (headroom -1 ) * Math.min(1, averageQueryRate().orElse(0) / 100.0) ) + 1;
+ return ( (headroom -1 ) * Math.min(1, averageQueryRate().orElse(0) / queryRateGivingFullConfidence) ) + 1;
}
/** The estimated fraction of cpu usage which goes to processing queries vs. writes */
- public double queryCpuFraction() {
+ private double queryCpuFraction() {
OptionalDouble writeRate = clusterTimeseries().writeRate(scalingDuration(), clock);
if (averageQueryRate().orElse(0) == 0 && writeRate.orElse(0) == 0) return queryCpuFraction(0.5);
return queryCpuFraction(averageQueryRate().orElse(0) / (averageQueryRate().orElse(0) + writeRate.orElse(0)));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java
index 6ab5ff731d3..a2fa6e63922 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java
@@ -30,6 +30,10 @@ public class Load {
public double memory() { return memory; }
public double disk() { return disk; }
+ /** Returns a copy of this with the cpu value replaced by the given value. */
+ public Load withCpu(double cpu) { return new Load(cpu, memory, disk); }
+ /** Returns a copy of this with the memory value replaced by the given value. */
+ public Load withMemory(double memory) { return new Load(cpu, memory, disk); }
+ /** Returns a copy of this with the disk value replaced by the given value. */
+ public Load withDisk(double disk) { return new Load(cpu, memory, disk); }
+
public Load add(Load other) {
return join(other, (a, b) -> a + b);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java
index 469cabc4ee4..1b73dee8b6c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java
@@ -12,6 +12,7 @@ import com.yahoo.slime.Slime;
import com.yahoo.slime.SlimeUtils;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import com.yahoo.vespa.hosted.provision.applications.BcpGroupInfo;
import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
import com.yahoo.vespa.hosted.provision.applications.Status;
import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling;
@@ -53,6 +54,10 @@ public class ApplicationSerializer {
private static final String groupSizeKey = "groupSize";
private static final String requiredKey = "required";
private static final String suggestedKey = "suggested";
+ // Field names of the serialized BCP group info object.
+ // As for the other keys here, the "Key" suffix belongs to the constant name only, not to the stored field name.
+ private static final String bcpGroupInfoKey = "bcpGroupInfo";
+ private static final String queryRateKey = "queryRate";
+ private static final String growthRateHeadroomKey = "growthRateHeadroom";
+ private static final String cpuCostPerQueryKey = "cpuCostPerQuery";
private static final String resourcesKey = "resources";
private static final String targetKey = "target";
private static final String nodesKey = "nodes";
@@ -129,6 +134,8 @@ public class ApplicationSerializer {
clusterObject.setBool(requiredKey, cluster.required());
toSlime(cluster.suggested(), clusterObject.setObject(suggestedKey));
toSlime(cluster.target(), clusterObject.setObject(targetKey));
+ if (! cluster.bcpGroupInfo().isEmpty())
+ toSlime(cluster.bcpGroupInfo(), clusterObject.setObject(bcpGroupInfoKey));
scalingEventsToSlime(cluster.scalingEvents(), clusterObject.setArray(scalingEventsKey));
}
@@ -141,6 +148,7 @@ public class ApplicationSerializer {
clusterObject.field(requiredKey).asBool(),
autoscalingFromSlime(clusterObject.field(suggestedKey), clusterObject.field("nonExisting")),
autoscalingFromSlime(clusterObject.field(targetKey), clusterObject.field(autoscalingStatusObjectKey)),
+ bcpGroupInfoFromSlime(clusterObject.field(bcpGroupInfoKey)),
scalingEventsFromSlime(clusterObject.field(scalingEventsKey)));
}
@@ -222,6 +230,20 @@ public class ApplicationSerializer {
loadFromSlime(autoscalingObject.field(idealKey)));
}
+ /**
+  * Writes the query rate, growth rate headroom and cpu cost per query of the given BCP group info
+  * as double fields of the given object. Writes nothing if the info is empty.
+  */
+ private static void toSlime(BcpGroupInfo bcpGroupInfo, Cursor bcpGroupInfoObject) {
+ if (bcpGroupInfo.isEmpty()) return;
+ bcpGroupInfoObject.setDouble(queryRateKey, bcpGroupInfo.queryRate());
+ bcpGroupInfoObject.setDouble(growthRateHeadroomKey, bcpGroupInfo.growthRateHeadroom());
+ bcpGroupInfoObject.setDouble(cpuCostPerQueryKey, bcpGroupInfo.cpuCostPerQuery());
+ }
+
+ /**
+  * Returns the BCP group info read from the three double fields of the given object,
+  * or the empty info if the object is not valid
+  * (presumably the case when the field was never written, e.g. for empty info - confirm).
+  */
+ private static BcpGroupInfo bcpGroupInfoFromSlime(Inspector bcpGroupInfoObject) {
+ if ( ! bcpGroupInfoObject.valid()) return BcpGroupInfo.empty();
+ return new BcpGroupInfo(bcpGroupInfoObject.field(queryRateKey).asDouble(),
+ bcpGroupInfoObject.field(growthRateHeadroomKey).asDouble(),
+ bcpGroupInfoObject.field(cpuCostPerQueryKey).asDouble());
+ }
+
private static void scalingEventsToSlime(List<ScalingEvent> scalingEvents, Cursor eventArray) {
scalingEvents.forEach(event -> toSlime(event, eventArray.addObject()));
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
index 05d0822758d..19c6ce16674 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
@@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.autoscale;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Capacity;
-import com.yahoo.config.provision.Cloud;
import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Flavor;
@@ -135,6 +134,7 @@ class AutoscalingTester {
cluster.required(),
cluster.suggested(),
cluster.target(),
+ cluster.bcpGroupInfo(),
List.of()); // Remove scaling events
cluster = cluster.with(ScalingEvent.create(cluster.minResources(), cluster.minResources(),
0,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java
new file mode 100644
index 00000000000..0bd94872557
--- /dev/null
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java
@@ -0,0 +1,238 @@
+package com.yahoo.vespa.hosted.provision.autoscale;
+
+import com.yahoo.config.provision.Capacity;
+import com.yahoo.config.provision.ClusterResources;
+import com.yahoo.config.provision.ClusterSpec;
+import com.yahoo.config.provision.NodeResources;
+import com.yahoo.vespa.hosted.provision.applications.BcpGroupInfo;
+import org.junit.Test;
+
+import java.time.Duration;
+import java.util.Optional;
+
+/**
+ * Tests autoscaling using information from the BCP group this cluster deployment
+ * is part of to supplement local data when the local deployment lacks sufficient traffic.
+ *
+ * @author bratseth
+ */
+public class AutoscalingUsingBcpGroupInfoTest {
+
+ /**
+  * Tests autoscaling with varying BCP group info parameters on the default fixture cluster.
+  * Only cpu measurements are added (no local query rate measurements), and one BCP group info
+  * parameter at a time is raised relative to the baseline (100, 1.1, 0.3).
+  */
+ @Test
+ public void test_autoscaling_single_content_group() {
+ var fixture = AutoscalingTester.fixture().awsProdSetup(true).build();
+
+ // Baseline
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.1, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 8, 1, 4.0, 7.6, 37.8,
+ fixture.autoscale());
+
+ // Higher query rate (mem and disk changes are due to being assigned larger hosts where we get less overhead share)
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.1, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 8, 1, 8.0, 7.4, 32.8,
+ fixture.autoscale());
+
+ // Higher headroom
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.3, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 9, 1, 4.2, 6.6, 33.1,
+ fixture.autoscale());
+
+ // Higher per query cost
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.1, 0.45));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 9, 1, 5.4, 6.6, 33.1,
+ fixture.autoscale());
+ }
+
+ /**
+  * Tests autoscaling with varying BCP group info parameters on a cluster which starts out with
+  * 9 nodes in 3 groups and is constrained to 3 groups by its min and max capacity.
+  * Only cpu measurements are added (no local query rate measurements), and one BCP group info
+  * parameter at a time is raised relative to the baseline (100, 1.1, 0.3).
+  */
+ @Test
+ public void test_autoscaling_multiple_content_groups() {
+ var min = new ClusterResources(3, 3,
+ new NodeResources(1, 4, 10, 1, NodeResources.DiskSpeed.any));
+ var max = new ClusterResources(21, 3,
+ new NodeResources(100, 1000, 1000, 1, NodeResources.DiskSpeed.any));
+ var fixture = AutoscalingTester.fixture()
+ .awsProdSetup(true)
+ .initialResources(Optional.of(new ClusterResources(9, 3, new NodeResources(2, 16, 75, 1))))
+ .capacity(Capacity.from(min, max))
+ .build();
+
+ // Baseline
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.1, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 3, 3, 10.5, 42.3, 187.0,
+ fixture.autoscale());
+
+ // Higher query rate (mem and disk changes are due to being assigned larger hosts where we get less overhead share)
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.1, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 3, 3, 20.9, 42.3, 178.0,
+ fixture.autoscale());
+
+ // Higher headroom
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.3, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 3, 3, 12.4, 42.3, 187.0,
+ fixture.autoscale());
+
+ // Higher per query cost
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.1, 0.45));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 3, 3, 15.7, 42.3, 187.0,
+ fixture.autoscale());
+ }
+
+ /**
+ * Tests with varying BCP group info parameters for containers.
+ * Differences from content
+ * - No host sharing.
+ * - Memory and disk is independent of cluster size.
+ *
+ * Only cpu measurements are added (no local query rate measurements), and one BCP group info
+ * parameter at a time is raised relative to the baseline (100, 1.1, 0.3).
+ */
+ @Test
+ public void test_autoscaling_container() {
+ var fixture = AutoscalingTester.fixture().clusterType(ClusterSpec.Type.container).awsProdSetup(true).build();
+
+ // Baseline
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.1, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 8, 1, 4.0, 16.0, 40.8,
+ fixture.autoscale());
+
+ // Higher query rate (only the third expected value, presumably vcpu, changes: The last two stay the same here)
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.1, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 8, 1, 8.0, 16.0, 40.8,
+ fixture.autoscale());
+
+ // Higher headroom
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.3, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 5, 1, 8.0, 16.0, 40.8,
+ fixture.autoscale());
+
+ // Higher per query cost
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.1, 0.45));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 6, 1, 8.0, 16.0, 40.8,
+ fixture.autoscale());
+ }
+
+ /**
+  * Tests how local measurements are combined with BCP group info as the local query rate varies,
+  * from no local query rate measurements at all up to a local rate of 100, and with group query
+  * rates both above and below the local rate.
+  *
+  * In each step after the first, the clock is moved back by the duration returned from adding the
+  * cpu measurements before the query rate measurements are added
+  * (presumably so that both series cover the same time span - confirm against Loader).
+  */
+ @Test
+ public void test_autoscaling_single_content_group_with_some_local_traffic() {
+ var fixture = AutoscalingTester.fixture().awsProdSetup(true).build();
+
+ // Baseline: No local traffic, group traffic indicates much higher cpu usage than local
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.3, 0.45));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 8, 1, 14.2, 7.4, 32.8,
+ fixture.autoscale());
+
+ // Some local traffic
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.3, 0.45));
+ Duration duration1 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration1.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 10.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 8, 1, 6.9, 7.6, 37.8,
+ fixture.autoscale());
+
+ // Enough local traffic to get half the votes
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.3, 0.45));
+ Duration duration2 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration2.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 50.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 7, 1, 3.5, 8.9, 55.5,
+ fixture.autoscale());
+
+ // Mostly local
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.3, 0.45));
+ Duration duration3 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration3.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 90.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 7, 1, 2.7, 8.9, 55.5,
+ fixture.autoscale());
+
+ // Local only
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.3, 0.45));
+ Duration duration4 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration4.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 100.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 7, 1, 2.6, 8.9, 55.5,
+ fixture.autoscale());
+
+ // No group info, should be the same as the above
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(BcpGroupInfo.empty());
+ Duration duration5 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration5.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 100.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 7, 1, 2.6, 8.9, 55.5,
+ fixture.autoscale());
+
+ // 40 query rate, no group info (for reference to the below)
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(BcpGroupInfo.empty());
+ Duration duration6 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration6.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 40.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 6, 1, 2.2, 10.6, 66.5,
+ fixture.autoscale());
+
+ // Local query rate is too low but global is even lower so disregard it, giving the same as above
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200/40.0, 1.3, 0.45*40.0));
+ Duration duration7 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration7.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 40.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 6, 1, 2.2, 10.6, 66.5,
+ fixture.autoscale());
+
+ // Local query rate is too low to be fully confident, and so is global but as it is slightly larger, incorporate it slightly
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200/4.0, 1.3, 0.45*4.0));
+ Duration duration8 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration8.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 40.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 7, 1, 2.2, 8.9, 55.5,
+ fixture.autoscale());
+ }
+
+}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
index 1e1e00a10db..5caf50a4e83 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
@@ -21,6 +21,7 @@ import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import com.yahoo.vespa.hosted.provision.applications.BcpGroupInfo;
import com.yahoo.vespa.hosted.provision.autoscale.awsnodes.AwsHostResourcesCalculatorImpl;
import com.yahoo.vespa.hosted.provision.autoscale.awsnodes.AwsNodeTypes;
import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator;
@@ -132,6 +133,12 @@ public class Fixture {
tester.nodeRepository().applications().put(application, tester.nodeRepository().applications().lock(applicationId));
}
+ public void store(BcpGroupInfo bcpGroupInfo) { // Sets the given BCP group info on the cluster identified by clusterId() and puts the updated application into the node repository's applications
+ var application = application();
+ application = application.with(application.cluster(clusterId()).get().with(bcpGroupInfo)); // NOTE(review): get() presumably unwraps an Optional and fails if the cluster is absent - confirm
+ tester.nodeRepository().applications().put(application, tester.nodeRepository().applications().lock(applicationId)); // The lock is acquired inline and handed to put; it is not closed in this method
+ }
+
public static class Builder {
ApplicationId application = AutoscalingTester.applicationId("application1");
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java
index 10c8c7434b1..a104f0b1bc8 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java
@@ -79,10 +79,10 @@ public class Loader {
return Duration.between(initialTime, fixture.tester().clock().instant());
}
- public void applyCpuLoad(double cpuLoad, int measurements) {
- addCpuMeasurements((float)cpuLoad, measurements);
- fixture.tester().clock().advance(samplingInterval.negated().multipliedBy(measurements));
- addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only
+ public Duration applyCpuLoad(double cpuLoad, int measurements) { // Adds cpu measurements followed by query rate measurements; returns whatever addQueryRateMeasurements returns
+ Duration duration = addCpuMeasurements((float)cpuLoad, measurements);
+ fixture.tester().clock().advance(duration.negated()); // Move the clock back by the duration returned above before adding the query rates (presumably so both series cover the same period - confirm)
+ return addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only
}
public void applyMemLoad(double memLoad, int measurements) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java
index bce10b999bb..c8dc0d97320 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java
@@ -8,6 +8,7 @@ import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import com.yahoo.vespa.hosted.provision.applications.BcpGroupInfo;
import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
import com.yahoo.vespa.hosted.provision.applications.Status;
import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling;
@@ -39,6 +40,7 @@ public class ApplicationSerializerTest {
true,
Autoscaling.empty(),
Autoscaling.empty(),
+ BcpGroupInfo.empty(),
List.of()));
var minResources = new NodeResources(1, 2, 3, 4);
clusters.add(new Cluster(ClusterSpec.Id.from("c2"),
@@ -61,6 +63,7 @@ public class ApplicationSerializerTest {
Instant.ofEpochMilli(5678L),
Load.zero(),
Load.one()),
+ new BcpGroupInfo(0.1, 0.2, 0.3),
List.of(new ScalingEvent(new ClusterResources(10, 5, minResources),
new ClusterResources(12, 6, minResources),
7L,
@@ -90,6 +93,7 @@ public class ApplicationSerializerTest {
assertEquals(originalCluster.required(), serializedCluster.required());
assertEquals(originalCluster.suggested(), serializedCluster.suggested());
assertEquals(originalCluster.target(), serializedCluster.target());
+ assertEquals(originalCluster.bcpGroupInfo(), serializedCluster.bcpGroupInfo());
assertEquals(originalCluster.scalingEvents(), serializedCluster.scalingEvents());
}
}