aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--config-lib/src/main/java/com/yahoo/config/ReferenceNode.java2
-rw-r--r--controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/BcpGroupInfo.java66
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java22
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java157
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java22
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java2
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java238
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java7
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java8
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java4
12 files changed, 463 insertions, 73 deletions
diff --git a/config-lib/src/main/java/com/yahoo/config/ReferenceNode.java b/config-lib/src/main/java/com/yahoo/config/ReferenceNode.java
index 2c668875f81..9c17ec3cd5b 100644
--- a/config-lib/src/main/java/com/yahoo/config/ReferenceNode.java
+++ b/config-lib/src/main/java/com/yahoo/config/ReferenceNode.java
@@ -68,7 +68,7 @@ public class ReferenceNode extends LeafNode<String> {
if (value == null) {
return value;
}
- StringBuffer buffer = new StringBuffer(value.trim());
+ StringBuilder buffer = new StringBuilder(value.trim());
if (buffer.length() > 0 && buffer.charAt(0) == '"') {
buffer.deleteCharAt(0);
}
diff --git a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
index c48f6a34441..322c78aa7c1 100644
--- a/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
+++ b/controller-server/src/main/java/com/yahoo/vespa/hosted/controller/maintenance/DeploymentMetricsMaintainer.java
@@ -25,8 +25,8 @@ import java.util.logging.Level;
import java.util.logging.Logger;
/**
- * Retrieve deployment metrics such as QPS and document count from the metric service and
- * update applications with this info.
+ * Retrieves deployment metrics such as QPS and document count from the metric service and
+ * updates applications with this info.
*
* @author smorgrav
* @author mpolden
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/BcpGroupInfo.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/BcpGroupInfo.java
new file mode 100644
index 00000000000..6b0ea8532be
--- /dev/null
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/BcpGroupInfo.java
@@ -0,0 +1,66 @@
+package com.yahoo.vespa.hosted.provision.applications;
+
+import java.util.Objects;
+
+/**
+ * When there are multiple deployments of an application in different regions,
+ * instances of the cluster across regions may form a "BCP group".
+ * By default the clusters in all production regions form such a group, but other arrangements
+ * may be specified in deployment.xml, see com.yahoo.config.application.api.Bcp.
+ *
+ * This contains metrics averaged over the other clusters in the group this belongs to,
+ * which is used to amend scaling decisions in this cluster when it has little traffic on its own.
+ *
+ * @author bratseth
+ */
+public class BcpGroupInfo {
+
+    private static final BcpGroupInfo empty = new BcpGroupInfo(0, 0, 0);
+
+    private final double queryRate;
+    private final double growthRateHeadroom;
+    private final double cpuCostPerQuery;
+
+    public BcpGroupInfo(double queryRate, double growthRateHeadroom, double cpuCostPerQuery) {
+        this.queryRate = queryRate;
+        this.growthRateHeadroom = growthRateHeadroom;
+        this.cpuCostPerQuery = cpuCostPerQuery;
+    }
+
+    /** Returns the average query rate (queries/second) of the other clusters in the group this belongs to. */
+    public double queryRate() { return queryRate; }
+
+    /** Returns the average growth rate headroom of the other clusters in the group this belongs to. */
+    public double growthRateHeadroom() { return growthRateHeadroom; }
+
+    /** Returns the average total cluster CPU cost per query of the other clusters in the group this belongs to. */
+    public double cpuCostPerQuery() { return cpuCostPerQuery; }
+
+    /** Returns whether all the values of this are zero, as they are in the instance returned by empty(). */
+    public boolean isEmpty() { return queryRate == 0 && growthRateHeadroom == 0 && cpuCostPerQuery == 0; }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if ( ! (o instanceof BcpGroupInfo other)) return false;
+        // Double.compare, not ==: keeps equals consistent with hashCode for 0.0 vs. -0.0 and for NaN
+        return Double.compare(other.queryRate, queryRate) == 0 &&
+               Double.compare(other.growthRateHeadroom, growthRateHeadroom) == 0 &&
+               Double.compare(other.cpuCostPerQuery, cpuCostPerQuery) == 0;
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(queryRate, growthRateHeadroom, cpuCostPerQuery);
+    }
+
+    @Override
+    public String toString() {
+        return "BCP group info: " + queryRate + " q/s, " + growthRateHeadroom + " q/s headroom, " +
+               cpuCostPerQuery + " CPU cost per q/s";
+    }
+
+    /** Returns the shared instance where all values are zero. */
+    public static BcpGroupInfo empty() { return empty; }
+
+}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
index 8da6bd6937b..ea4944c2bd5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
@@ -33,6 +33,7 @@ public class Cluster {
private final boolean required;
private final Autoscaling suggested;
private final Autoscaling target;
+ private final BcpGroupInfo bcpGroupInfo;
/** The maxScalingEvents last scaling events of this, sorted by increasing time (newest last) */
private final List<ScalingEvent> scalingEvents;
@@ -45,6 +46,7 @@ public class Cluster {
boolean required,
Autoscaling suggested,
Autoscaling target,
+ BcpGroupInfo bcpGroupInfo,
List<ScalingEvent> scalingEvents) {
this.id = Objects.requireNonNull(id);
this.exclusive = exclusive;
@@ -58,6 +60,7 @@ public class Cluster {
this.target = Autoscaling.empty();
else
this.target = target;
+ this.bcpGroupInfo = Objects.requireNonNull(bcpGroupInfo);
this.scalingEvents = List.copyOf(scalingEvents);
}
@@ -77,7 +80,7 @@ public class Cluster {
/**
* Returns whether the resources of this cluster are required to be within the specified min and max.
- * Otherwise they may be adjusted by capacity policies.
+ * Otherwise, they may be adjusted by capacity policies.
*/
public boolean required() { return required; }
@@ -102,6 +105,9 @@ public class Cluster {
return true;
}
+ /** Returns the info about the BCP group of clusters which this cluster belongs to. */
+ public BcpGroupInfo bcpGroupInfo() {
+     return bcpGroupInfo;
+ }
+
/** Returns the recent scaling events in this cluster */
public List<ScalingEvent> scalingEvents() { return scalingEvents; }
@@ -113,15 +119,19 @@ public class Cluster {
public Cluster withConfiguration(boolean exclusive, Capacity capacity) {
return new Cluster(id, exclusive,
capacity.minResources(), capacity.maxResources(), capacity.groupSize(), capacity.isRequired(),
- suggested, target, scalingEvents);
+ suggested, target, bcpGroupInfo, scalingEvents);
}
public Cluster withSuggested(Autoscaling suggested) {
- return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, scalingEvents);
+ return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, bcpGroupInfo, scalingEvents);
}
public Cluster withTarget(Autoscaling target) {
- return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, scalingEvents);
+ return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, bcpGroupInfo, scalingEvents);
+ }
+
+ /** Returns a copy of this cluster where the BCP group info is replaced by the given value. */
+ public Cluster with(BcpGroupInfo bcpGroupInfo) {
+     return new Cluster(id, exclusive,
+                        min, max, groupSize, required,
+                        suggested, target,
+                        bcpGroupInfo,
+                        scalingEvents);
}
/** Add or update (based on "at" time) a scaling event */
@@ -135,7 +145,7 @@ public class Cluster {
scalingEvents.add(scalingEvent);
prune(scalingEvents);
- return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, scalingEvents);
+ return new Cluster(id, exclusive, min, max, groupSize, required, suggested, target, bcpGroupInfo, scalingEvents);
}
@Override
@@ -167,7 +177,7 @@ public class Cluster {
public static Cluster create(ClusterSpec.Id id, boolean exclusive, Capacity requested) {
return new Cluster(id, exclusive,
requested.minResources(), requested.maxResources(), requested.groupSize(), requested.isRequired(),
- Autoscaling.empty(), Autoscaling.empty(), List.of());
+ Autoscaling.empty(), Autoscaling.empty(), BcpGroupInfo.empty(), List.of());
}
/** The predicted time it will take to rescale this cluster. */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
index da74ad0b63b..4edcdbd3fa5 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -30,6 +30,9 @@ public class ClusterModel {
/** Containers typically use more cpu right after generation change, so discard those metrics */
public static final Duration warmupDuration = Duration.ofMinutes(7);
+ /** If we have less than this query rate, we cannot be fully confident in our load data, which influences some decisions. */
+ public static final double queryRateGivingFullConfidence = 100.0;
+
static final double idealQueryCpuLoad = 0.8;
static final double idealWriteCpuLoad = 0.95;
@@ -48,8 +51,13 @@ public class ClusterModel {
private final Application application;
private final ClusterSpec clusterSpec;
private final Cluster cluster;
- /** The current nodes of this cluster, or empty if this models a new cluster not yet deployed */
+
+ /**
+ * The current active nodes of this cluster, including retired,
+ * or empty if this models a new cluster not yet deployed.
+ */
private final NodeList nodes;
+
private final Clock clock;
private final Duration scalingDuration;
private final ClusterTimeseries clusterTimeseries;
@@ -118,6 +126,14 @@ public class ClusterModel {
return adjustment;
}
+ /**
+  * Returns the CPU cost per query in this cluster: The peak cpu load times the query fraction of cpu usage,
+  * the fan-out and the vcpu of the first non-retired node, divided by the average query rate and the group count.
+  *
+  * @return the cost per query, or empty if the average query rate is unknown or zero
+  */
+ public OptionalDouble cpuCostPerQuery() {
+     OptionalDouble queryRate = averageQueryRate();
+     // A rate of zero would make the division below produce infinity or NaN, so treat it as "no data"
+     if (queryRate.isEmpty() || queryRate.getAsDouble() == 0.0) return OptionalDouble.empty();
+     // TODO: Query rate should generally be sampled at the time where we see the peak resource usage
+     int fanOut = clusterSpec.type().isContainer() ? 1 : groupSize();
+     return OptionalDouble.of(peakLoad().cpu() * queryCpuFraction() * fanOut * nodes.not().retired().first().get().resources().vcpu()
+                              / queryRate.getAsDouble() / groupCount());
+ }
+
public boolean isStable(NodeRepository nodeRepository) {
// An autoscaling decision was recently made
if (hasScaledIn(Duration.ofMinutes(5)))
@@ -143,59 +159,12 @@ public class ClusterModel {
return true;
}
- private boolean hasScaledIn(Duration period) {
- return cluster.lastScalingEvent().map(event -> event.at()).orElse(Instant.MIN)
- .isAfter(clock.instant().minus(period));
- }
-
/** Returns the predicted duration of a rescaling of this cluster */
public Duration scalingDuration() { return scalingDuration; }
- public ClusterNodesTimeseries nodeTimeseries() { return nodeTimeseries; }
-
- public ClusterTimeseries clusterTimeseries() { return clusterTimeseries; }
-
- /**
- * Returns the predicted max query growth rate per minute as a fraction of the average traffic
- * in the scaling window.
- */
- public double maxQueryGrowthRate() {
- if (maxQueryGrowthRate != null) return maxQueryGrowthRate;
- return maxQueryGrowthRate = clusterTimeseries().maxQueryGrowthRate(scalingDuration(), clock);
- }
-
- /** Returns the average query rate in the scaling window as a fraction of the max observed query rate. */
- public double queryFractionOfMax() {
- if (queryFractionOfMax != null) return queryFractionOfMax;
- return queryFractionOfMax = clusterTimeseries().queryFractionOfMax(scalingDuration(), clock);
- }
-
- /** Returns the average query rate in the scaling window. */
- public OptionalDouble averageQueryRate() {
- if (averageQueryRate != null) return averageQueryRate;
- return averageQueryRate = clusterTimeseries().queryRate(scalingDuration(), clock);
- }
-
/** Returns the average of the peak load measurement in each dimension, from each node. */
- public Load peakLoad() { return nodeTimeseries().peakLoad(); }
-
- /** The number of nodes this cluster has, or will have if not deployed yet. */
- // TODO: Make this the deployed, not current count
- public int nodeCount() {
- if ( ! nodes.isEmpty()) return (int)nodes.stream().count();
- return cluster.minResources().nodes();
- }
-
- /** The number of groups this cluster has, or will have if not deployed yet. */
- // TODO: Make this the deployed, not current count
- public int groupCount() {
- if ( ! nodes.isEmpty()) return (int)nodes.stream().mapToInt(node -> node.allocation().get().membership().cluster().group().get().index()).distinct().count();
- return cluster.minResources().groups();
- }
-
- public int groupSize() {
- // ceil: If the division does not produce a whole number we assume some node is missing
- return (int)Math.ceil((double)nodeCount() / groupCount());
+ public Load peakLoad() {
+ return nodeTimeseries().peakLoad();
}
/** Returns the relative load adjustment accounting for redundancy in this. */
@@ -235,15 +204,88 @@ public class ClusterModel {
* if one of the nodes go down.
*/
public Load idealLoad() {
- return new Load(idealCpuLoad(), idealMemoryLoad(), idealDiskLoad()).divide(redundancyAdjustment());
+     var ideal = new Load(idealCpuLoad(), idealMemoryLoad(), idealDiskLoad()).divide(redundancyAdjustment());
+     var bcpGroupInfo = cluster.bcpGroupInfo();
+     // Group info with a zero query rate cannot be weighed against local data: With no local traffic either,
+     // the weight below would become 0/0 = NaN and turn the entire ideal load into NaN. Use local data only then.
+     if ( ! bcpGroupInfo.isEmpty() && bcpGroupInfo.queryRate() > 0) {
+         // Do a weighted sum of the ideal "vote" based on local and bcp group info.
+         // This avoids any discontinuities with a near-zero local query rate.
+         double localInformationWeight = Math.min(1, averageQueryRate().orElse(0) /
+                                                     Math.min(queryRateGivingFullConfidence, bcpGroupInfo.queryRate()));
+         Load bcpGroupIdeal = adjustQueryDependentIdealLoadByBcpGroupInfo(ideal);
+         ideal = ideal.multiply(localInformationWeight).add(bcpGroupIdeal.multiply(1 - localInformationWeight));
+     }
+     return ideal;
}
- public int nodesAdjustedForRedundancy(int nodes, int groups) {
+ /** Returns the point in time at which this model was created. */
+ public Instant at() {
+     return at;
+ }
+
+ /**
+  * Returns the given ideal load with its cpu component replaced by the current peak cpu load divided by
+  * the ratio between the vcpu needed per group according to the BCP group info, and the vcpu a group has now.
+  */
+ private Load adjustQueryDependentIdealLoadByBcpGroupInfo(Load ideal) {
+     var groupInfo = cluster.bcpGroupInfo();
+     double vcpuPerNode = nodes.not().retired().first().get().resources().vcpu();
+     double currentVcpuPerGroup = vcpuPerNode * groupSize();
+
+     // The group query rate, multiplied by the headroom for growth and for traffic shifts
+     double queryRateToHandle = groupInfo.queryRate() * groupInfo.growthRateHeadroom() * trafficShiftHeadroom();
+
+     // Vcpu needed for queries, plus the non-query fraction of the ideal cpu load
+     double queryVcpuPerGroup = groupInfo.cpuCostPerQuery() * queryRateToHandle / groupCount();
+     int fanOut = clusterSpec.type().isContainer() ? 1 : groupSize();
+     double nonQueryVcpuPerGroup = (1 - queryCpuFraction()) * idealCpuLoad() * fanOut;
+     double neededVcpuPerGroup = queryVcpuPerGroup + nonQueryVcpuPerGroup;
+
+     double cpuAdjustment = neededVcpuPerGroup / currentVcpuPerGroup;
+     return ideal.withCpu(peakLoad().cpu() / cpuAdjustment);
+ }
+
+ /** Returns whether the last scaling event of this cluster, if any, is later than the given period before now. */
+ private boolean hasScaledIn(Duration period) {
+     Instant lastScaledAt = cluster.lastScalingEvent().map(event -> event.at()).orElse(Instant.MIN);
+     Instant periodStart = clock.instant().minus(period);
+     return lastScaledAt.isAfter(periodStart);
+ }
+
+ /** Returns the node timeseries held by this model. */
+ private ClusterNodesTimeseries nodeTimeseries() {
+     return nodeTimeseries;
+ }
+
+ /** Returns the cluster timeseries held by this model. */
+ private ClusterTimeseries clusterTimeseries() {
+     return clusterTimeseries;
+ }
+
+ /**
+  * Returns the predicted max query growth rate per minute, expressed as a fraction of the average traffic
+  * in the scaling window. The value is computed on first use and then kept.
+  */
+ private double maxQueryGrowthRate() {
+     if (maxQueryGrowthRate == null) // not computed yet
+         maxQueryGrowthRate = clusterTimeseries().maxQueryGrowthRate(scalingDuration(), clock);
+     return maxQueryGrowthRate;
+ }
+
+ /**
+  * Returns the average query rate in the scaling window as a fraction of the max observed query rate.
+  * The value is computed on first use and then kept.
+  */
+ private double queryFractionOfMax() {
+     if (queryFractionOfMax == null) // not computed yet
+         queryFractionOfMax = clusterTimeseries().queryFractionOfMax(scalingDuration(), clock);
+     return queryFractionOfMax;
+ }
+
+ /** Returns the average query rate in the scaling window. The value is computed on first use and then kept. */
+ private OptionalDouble averageQueryRate() {
+     if (averageQueryRate == null) // not computed yet
+         averageQueryRate = clusterTimeseries().queryRate(scalingDuration(), clock);
+     return averageQueryRate;
+ }
+
+ /** The number of nodes in this cluster, or the minimum node count of the cluster if it has no nodes yet. */
+ // TODO: Make this the deployed, not current count
+ private int nodeCount() {
+     if (nodes.isEmpty())
+         return cluster.minResources().nodes();
+     long currentNodes = nodes.stream().count();
+     return (int)currentNodes;
+ }
+
+ /**
+  * The number of distinct group indexes among the nodes of this cluster,
+  * or the minimum group count of the cluster if it has no nodes yet.
+  */
+ // TODO: Make this the deployed, not current count
+ private int groupCount() {
+     if (nodes.isEmpty())
+         return cluster.minResources().groups();
+     long distinctGroups = nodes.stream()
+                                .mapToInt(node -> node.allocation().get().membership().cluster().group().get().index())
+                                .distinct()
+                                .count();
+     return (int)distinctGroups;
+ }
+
+ /** Returns the number of nodes per group, rounded up. */
+ private int groupSize() {
+     double nodesPerGroup = (double)nodeCount() / groupCount();
+     // Round up: If the division does not produce a whole number we assume some node is missing
+     return (int)Math.ceil(nodesPerGroup);
+ }
+
+ private int nodesAdjustedForRedundancy(int nodes, int groups) {
int groupSize = (int)Math.ceil((double)nodes / groups);
return nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes;
}
- public int groupsAdjustedForRedundancy(int nodes, int groups) {
+ private int groupsAdjustedForRedundancy(int nodes, int groups) {
return nodes > 1 ? (groups == 1 ? 1 : groups - 1) : groups;
}
@@ -258,9 +300,6 @@ public class ClusterModel {
(1 - queryCpuFraction) * idealWriteCpuLoad;
}
- /** Returns the instant this model was created. */
- public Instant at() { return at;}
-
/** Returns the headroom for growth during organic traffic growth as a multiple of current resources. */
private double growthRateHeadroom() {
if ( ! zone.environment().isProduction()) return 1;
@@ -280,7 +319,7 @@ public class ClusterModel {
if ( ! zone.environment().isProduction()) return 1;
double trafficShiftHeadroom;
if (application.status().maxReadShare() == 0) // No traffic fraction data
- trafficShiftHeadroom = 2.0; // assume we currently get half of the global share of traffic
+ trafficShiftHeadroom = 2.0; // assume we currently get half of the max possible share of traffic
else if (application.status().currentReadShare() == 0)
trafficShiftHeadroom = 1/application.status().maxReadShare();
else
@@ -294,11 +333,11 @@ public class ClusterModel {
* with high confidence to avoid large adjustments caused by random noise due to low traffic numbers.
*/
private double adjustByConfidence(double headroom) {
- return ( (headroom -1 ) * Math.min(1, averageQueryRate().orElse(0) / 100.0) ) + 1;
+ return ( (headroom -1 ) * Math.min(1, averageQueryRate().orElse(0) / queryRateGivingFullConfidence) ) + 1;
}
/** The estimated fraction of cpu usage which goes to processing queries vs. writes */
- public double queryCpuFraction() {
+ private double queryCpuFraction() {
OptionalDouble writeRate = clusterTimeseries().writeRate(scalingDuration(), clock);
if (averageQueryRate().orElse(0) == 0 && writeRate.orElse(0) == 0) return queryCpuFraction(0.5);
return queryCpuFraction(averageQueryRate().orElse(0) / (averageQueryRate().orElse(0) + writeRate.orElse(0)));
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java
index 6ab5ff731d3..a2fa6e63922 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java
@@ -30,6 +30,10 @@ public class Load {
public double memory() { return memory; }
public double disk() { return disk; }
+ /** Returns a copy of this load where cpu is replaced by the given value. */
+ public Load withCpu(double cpu) {
+     return new Load(cpu, this.memory, this.disk);
+ }
+ /** Returns a copy of this load where memory is replaced by the given value. */
+ public Load withMemory(double memory) {
+     return new Load(this.cpu, memory, this.disk);
+ }
+ /** Returns a copy of this load where disk is replaced by the given value. */
+ public Load withDisk(double disk) {
+     return new Load(this.cpu, this.memory, disk);
+ }
+
public Load add(Load other) {
return join(other, (a, b) -> a + b);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java
index 469cabc4ee4..1b73dee8b6c 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java
@@ -12,6 +12,7 @@ import com.yahoo.slime.Slime;
import com.yahoo.slime.SlimeUtils;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import com.yahoo.vespa.hosted.provision.applications.BcpGroupInfo;
import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
import com.yahoo.vespa.hosted.provision.applications.Status;
import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling;
@@ -53,6 +54,10 @@ public class ApplicationSerializer {
private static final String groupSizeKey = "groupSize";
private static final String requiredKey = "required";
private static final String suggestedKey = "suggested";
+ private static final String bcpGroupInfoKey = "bcpGroupInfo";
+ private static final String queryRateKey = "queryRateKey";
+ private static final String growthRateHeadroomKey = "growthRateHeadroomKey";
+ private static final String cpuCostPerQueryKey = "cpuCostPerQueryKey";
private static final String resourcesKey = "resources";
private static final String targetKey = "target";
private static final String nodesKey = "nodes";
@@ -129,6 +134,8 @@ public class ApplicationSerializer {
clusterObject.setBool(requiredKey, cluster.required());
toSlime(cluster.suggested(), clusterObject.setObject(suggestedKey));
toSlime(cluster.target(), clusterObject.setObject(targetKey));
+ if (! cluster.bcpGroupInfo().isEmpty())
+ toSlime(cluster.bcpGroupInfo(), clusterObject.setObject(bcpGroupInfoKey));
scalingEventsToSlime(cluster.scalingEvents(), clusterObject.setArray(scalingEventsKey));
}
@@ -141,6 +148,7 @@ public class ApplicationSerializer {
clusterObject.field(requiredKey).asBool(),
autoscalingFromSlime(clusterObject.field(suggestedKey), clusterObject.field("nonExisting")),
autoscalingFromSlime(clusterObject.field(targetKey), clusterObject.field(autoscalingStatusObjectKey)),
+ bcpGroupInfoFromSlime(clusterObject.field(bcpGroupInfoKey)),
scalingEventsFromSlime(clusterObject.field(scalingEventsKey)));
}
@@ -222,6 +230,20 @@ public class ApplicationSerializer {
loadFromSlime(autoscalingObject.field(idealKey)));
}
+ /** Writes the three values of the given BCP group info to the given object, or nothing if the info is empty. */
+ private static void toSlime(BcpGroupInfo bcpGroupInfo, Cursor bcpGroupInfoObject) {
+     if ( ! bcpGroupInfo.isEmpty()) {
+         double queryRate = bcpGroupInfo.queryRate();
+         double growthRateHeadroom = bcpGroupInfo.growthRateHeadroom();
+         double cpuCostPerQuery = bcpGroupInfo.cpuCostPerQuery();
+         bcpGroupInfoObject.setDouble(queryRateKey, queryRate);
+         bcpGroupInfoObject.setDouble(growthRateHeadroomKey, growthRateHeadroom);
+         bcpGroupInfoObject.setDouble(cpuCostPerQueryKey, cpuCostPerQuery);
+     }
+ }
+
+ /** Reads BCP group info from the given object, returning the empty info if the object is not valid. */
+ private static BcpGroupInfo bcpGroupInfoFromSlime(Inspector bcpGroupInfoObject) {
+     if (bcpGroupInfoObject.valid()) {
+         double queryRate = bcpGroupInfoObject.field(queryRateKey).asDouble();
+         double growthRateHeadroom = bcpGroupInfoObject.field(growthRateHeadroomKey).asDouble();
+         double cpuCostPerQuery = bcpGroupInfoObject.field(cpuCostPerQueryKey).asDouble();
+         return new BcpGroupInfo(queryRate, growthRateHeadroom, cpuCostPerQuery);
+     }
+     return BcpGroupInfo.empty();
+ }
+
private static void scalingEventsToSlime(List<ScalingEvent> scalingEvents, Cursor eventArray) {
scalingEvents.forEach(event -> toSlime(event, eventArray.addObject()));
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
index 05d0822758d..19c6ce16674 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
@@ -3,7 +3,6 @@ package com.yahoo.vespa.hosted.provision.autoscale;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Capacity;
-import com.yahoo.config.provision.Cloud;
import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Flavor;
@@ -135,6 +134,7 @@ class AutoscalingTester {
cluster.required(),
cluster.suggested(),
cluster.target(),
+ cluster.bcpGroupInfo(),
List.of()); // Remove scaling events
cluster = cluster.with(ScalingEvent.create(cluster.minResources(), cluster.minResources(),
0,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java
new file mode 100644
index 00000000000..0bd94872557
--- /dev/null
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingUsingBcpGroupInfoTest.java
@@ -0,0 +1,238 @@
+package com.yahoo.vespa.hosted.provision.autoscale;
+
+import com.yahoo.config.provision.Capacity;
+import com.yahoo.config.provision.ClusterResources;
+import com.yahoo.config.provision.ClusterSpec;
+import com.yahoo.config.provision.NodeResources;
+import com.yahoo.vespa.hosted.provision.applications.BcpGroupInfo;
+import org.junit.Test;
+
+import java.time.Duration;
+import java.util.Optional;
+
+/**
+ * Tests autoscaling using information from the BCP group this cluster deployment
+ * is part of to supplement local data when the local deployment lacks sufficient traffic.
+ *
+ * @author bratseth
+ */
+public class AutoscalingUsingBcpGroupInfoTest {
+
+ /**
+  * Tests with varying BCP group info parameters and no local query traffic.
+  * Each step stores new group info, adds cpu measurements and asserts the resulting autoscaling.
+  * The BcpGroupInfo arguments are (query rate, growth rate headroom, cpu cost per query).
+  */
+ @Test
+ public void test_autoscaling_single_content_group() {
+ var fixture = AutoscalingTester.fixture().awsProdSetup(true).build();
+
+ // Baseline which the steps below vary one parameter of
+ // NOTE(review): the numeric assertResources arguments are presumably nodes, groups, vcpu, memory, disk - confirm
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.1, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 8, 1, 4.0, 7.6, 37.8,
+ fixture.autoscale());
+
+ // Higher query rate (mem and disk changes are due to being assigned larger hosts where we get less overhead share)
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.1, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 8, 1, 8.0, 7.4, 32.8,
+ fixture.autoscale());
+
+ // Higher headroom
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.3, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 9, 1, 4.2, 6.6, 33.1,
+ fixture.autoscale());
+
+ // Higher per query cost
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.1, 0.45));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 9, 1, 5.4, 6.6, 33.1,
+ fixture.autoscale());
+ }
+
+ /**
+  * Tests with varying BCP group info parameters and no local query traffic,
+  * in a cluster which starts with 9 nodes in 3 groups and is limited to 3 groups by its capacity.
+  * The BcpGroupInfo arguments are (query rate, growth rate headroom, cpu cost per query).
+  */
+ @Test
+ public void test_autoscaling_multiple_content_groups() {
+ var min = new ClusterResources(3, 3,
+ new NodeResources(1, 4, 10, 1, NodeResources.DiskSpeed.any));
+ var max = new ClusterResources(21, 3,
+ new NodeResources(100, 1000, 1000, 1, NodeResources.DiskSpeed.any));
+ var fixture = AutoscalingTester.fixture()
+ .awsProdSetup(true)
+ .initialResources(Optional.of(new ClusterResources(9, 3, new NodeResources(2, 16, 75, 1))))
+ .capacity(Capacity.from(min, max))
+ .build();
+
+ // Baseline which the steps below vary one parameter of
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.1, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 3, 3, 10.5, 42.3, 187.0,
+ fixture.autoscale());
+
+ // Higher query rate (only disk changes here; presumably due to being assigned larger hosts where we get less overhead share)
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.1, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 3, 3, 20.9, 42.3, 178.0,
+ fixture.autoscale());
+
+ // Higher headroom
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.3, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 3, 3, 12.4, 42.3, 187.0,
+ fixture.autoscale());
+
+ // Higher per query cost
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.1, 0.45));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 3, 3, 15.7, 42.3, 187.0,
+ fixture.autoscale());
+ }
+
+ /**
+  * Tests with varying BCP group info parameters for containers, with no local query traffic.
+  * Differences from content
+  * - No host sharing.
+  * - Memory and disk is independent of cluster size.
+  *
+  * The BcpGroupInfo arguments are (query rate, growth rate headroom, cpu cost per query).
+  */
+ @Test
+ public void test_autoscaling_container() {
+ var fixture = AutoscalingTester.fixture().clusterType(ClusterSpec.Type.container).awsProdSetup(true).build();
+
+ // Baseline which the steps below vary one parameter of
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.1, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 8, 1, 4.0, 16.0, 40.8,
+ fixture.autoscale());
+
+ // Higher query rate (mem and disk are unchanged in all steps here, as they are independent of cluster size for containers)
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.1, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 8, 1, 8.0, 16.0, 40.8,
+ fixture.autoscale());
+
+ // Higher headroom
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.3, 0.3));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 5, 1, 8.0, 16.0, 40.8,
+ fixture.autoscale());
+
+ // Higher per query cost
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(100, 1.1, 0.45));
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 6, 1, 8.0, 16.0, 40.8,
+ fixture.autoscale());
+ }
+
+ @Test
+ public void test_autoscaling_single_content_group_with_some_local_traffic() {
+ var fixture = AutoscalingTester.fixture().awsProdSetup(true).build();
+
+ // Baseline: No local traffic, group traffic indicates much higher cpu usage than local
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.3, 0.45)); // the same group info is re-stored in the next four steps; only the local query rate varies
+ fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 8, 1, 14.2, 7.4, 32.8,
+ fixture.autoscale());
+
+ // Some local traffic
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.3, 0.45));
+ Duration duration1 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration1.negated()); // rewind by the returned duration, presumably so the query rate samples cover the same period as the cpu samples - confirm
+ fixture.loader().addQueryRateMeasurements(10, __ -> 10.0); // constant local query rate of 10
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 8, 1, 6.9, 7.6, 37.8,
+ fixture.autoscale());
+
+ // Enough local traffic to get half the votes
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.3, 0.45));
+ Duration duration2 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration2.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 50.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 7, 1, 3.5, 8.9, 55.5,
+ fixture.autoscale());
+
+ // Mostly local
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.3, 0.45));
+ Duration duration3 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration3.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 90.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 7, 1, 2.7, 8.9, 55.5,
+ fixture.autoscale());
+
+ // Local only
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200, 1.3, 0.45));
+ Duration duration4 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration4.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 100.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 7, 1, 2.6, 8.9, 55.5,
+ fixture.autoscale());
+
+ // No group info, should be the same as the above
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(BcpGroupInfo.empty());
+ Duration duration5 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration5.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 100.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 7, 1, 2.6, 8.9, 55.5, // identical expectation to the "Local only" step
+ fixture.autoscale());
+
+ // 40 query rate, no group info (for reference to the below)
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(BcpGroupInfo.empty());
+ Duration duration6 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration6.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 40.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 6, 1, 2.2, 10.6, 66.5,
+ fixture.autoscale());
+
+ // Local query rate is too low but global is even lower so disregard it, giving the same as above
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200/40.0, 1.3, 0.45*40.0)); // group query rate 200/40 = 5, below the local rate of 40; third argument scaled up by the same factor
+ Duration duration7 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration7.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 40.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 6, 1, 2.2, 10.6, 66.5, // identical expectation to the reference step without group info
+ fixture.autoscale());
+
+ // Local query rate is too low to be fully confident, and so is global, but as it is slightly larger, incorporate it slightly
+ fixture.tester().clock().advance(Duration.ofDays(2));
+ fixture.store(new BcpGroupInfo(200/4.0, 1.3, 0.45*4.0)); // group query rate 200/4 = 50, slightly above the local rate of 40
+ Duration duration8 = fixture.loader().addCpuMeasurements(0.7f, 10);
+ fixture.tester().clock().advance(duration8.negated());
+ fixture.loader().addQueryRateMeasurements(10, __ -> 40.0);
+ fixture.tester().assertResources("Scaling up cpu using bcp group cpu info",
+ 7, 1, 2.2, 8.9, 55.5,
+ fixture.autoscale());
+ }
+
+}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
index 1e1e00a10db..5caf50a4e83 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
@@ -21,6 +21,7 @@ import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import com.yahoo.vespa.hosted.provision.applications.BcpGroupInfo;
import com.yahoo.vespa.hosted.provision.autoscale.awsnodes.AwsHostResourcesCalculatorImpl;
import com.yahoo.vespa.hosted.provision.autoscale.awsnodes.AwsNodeTypes;
import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator;
@@ -132,6 +133,12 @@ public class Fixture {
tester.nodeRepository().applications().put(application, tester.nodeRepository().applications().lock(applicationId));
}
+ /** Replaces the BCP group info on this fixture's cluster and writes the updated application back to the node repository. */
+ public void store(BcpGroupInfo bcpGroupInfo) {
+ var applications = tester.nodeRepository().applications();
+ var current = application();
+ var updatedCluster = current.cluster(clusterId()).get().with(bcpGroupInfo);
+ applications.put(current.with(updatedCluster), applications.lock(applicationId));
+ }
+
public static class Builder {
ApplicationId application = AutoscalingTester.applicationId("application1");
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java
index 10c8c7434b1..a104f0b1bc8 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Loader.java
@@ -79,10 +79,10 @@ public class Loader {
return Duration.between(initialTime, fixture.tester().clock().instant());
}
- public void applyCpuLoad(double cpuLoad, int measurements) {
- addCpuMeasurements((float)cpuLoad, measurements);
- fixture.tester().clock().advance(samplingInterval.negated().multipliedBy(measurements));
- addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only
+ public Duration applyCpuLoad(double cpuLoad, int measurements) {
+ Duration duration = addCpuMeasurements((float)cpuLoad, measurements); // keep the returned duration so the clock can be rewound by exactly that amount
+ fixture.tester().clock().advance(duration.negated()); // rewind the clock before adding the query rate measurements
+ return addQueryRateMeasurements(measurements, t -> t == 0 ? 200.0 : 100.0); // Query traffic only: 200.0 when t == 0, 100.0 otherwise
}
public void applyMemLoad(double memLoad, int measurements) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java
index bce10b999bb..c8dc0d97320 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializerTest.java
@@ -8,6 +8,7 @@ import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import com.yahoo.vespa.hosted.provision.applications.BcpGroupInfo;
import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
import com.yahoo.vespa.hosted.provision.applications.Status;
import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling;
@@ -39,6 +40,7 @@ public class ApplicationSerializerTest {
true,
Autoscaling.empty(),
Autoscaling.empty(),
+ BcpGroupInfo.empty(),
List.of()));
var minResources = new NodeResources(1, 2, 3, 4);
clusters.add(new Cluster(ClusterSpec.Id.from("c2"),
@@ -61,6 +63,7 @@ public class ApplicationSerializerTest {
Instant.ofEpochMilli(5678L),
Load.zero(),
Load.one()),
+ new BcpGroupInfo(0.1, 0.2, 0.3),
List.of(new ScalingEvent(new ClusterResources(10, 5, minResources),
new ClusterResources(12, 6, minResources),
7L,
@@ -90,6 +93,7 @@ public class ApplicationSerializerTest {
assertEquals(originalCluster.required(), serializedCluster.required());
assertEquals(originalCluster.suggested(), serializedCluster.suggested());
assertEquals(originalCluster.target(), serializedCluster.target());
+ assertEquals(originalCluster.bcpGroupInfo(), serializedCluster.bcpGroupInfo());
assertEquals(originalCluster.scalingEvents(), serializedCluster.scalingEvents());
}
}