summaryrefslogtreecommitdiffstats
path: root/node-repository
diff options
context:
space:
mode:
authorValerij Fredriksen <freva@users.noreply.github.com>2021-03-18 13:27:14 +0100
committerGitHub <noreply@github.com>2021-03-18 13:27:14 +0100
commit7c5948aceceec2a3c279755622876932fdf5a3ed (patch)
tree1801a9db90991b40f6f2ff27f609d02f049e3ce0 /node-repository
parent2e0ad58bb9bbb5cc24289acff25046a5b9442e2b (diff)
parent95a73e1587180eee3a272aed8a02e865943666cc (diff)
Merge pull request #17030 from vespa-engine/bratseth/cluster-model
Bratseth/cluster model
Diffstat (limited to 'node-repository')
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java26
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java66
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java173
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java8
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Resource.java6
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java74
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java48
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java1
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java6
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java (renamed from node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java)45
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java5
13 files changed, 259 insertions, 205 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
index 69d7cec4007..59b70ff1ef0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
@@ -129,32 +129,6 @@ public class Cluster {
return new Cluster(id, exclusive, min, max, suggested, target, scalingEvents, autoscalingStatus);
}
- /** The predicted duration of a rescaling of this cluster */
- public Duration scalingDuration(ClusterSpec clusterSpec) {
- int completedEventCount = 0;
- Duration totalDuration = Duration.ZERO;
- for (ScalingEvent event : scalingEvents()) {
- if (event.duration().isEmpty()) continue;
- completedEventCount++;
- totalDuration = totalDuration.plus(event.duration().get());
- }
-
- if (completedEventCount == 0) { // Use defaults
- if (clusterSpec.isStateful()) return Duration.ofHours(12);
- return Duration.ofMinutes(10);
- }
- else {
- Duration predictedDuration = totalDuration.dividedBy(completedEventCount);
-
- // TODO: Remove when we have reliable completion for content clusters
- if (clusterSpec.isStateful() && predictedDuration.minus(Duration.ofHours(12)).isNegative())
- return Duration.ofHours(12);
-
- if (predictedDuration.minus(Duration.ofMinutes(5)).isNegative()) return Duration.ofMinutes(5); // minimum
- return predictedDuration;
- }
- }
-
@Override
public int hashCode() { return id.hashCode(); }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index ac3430fecf9..9791aabf7b4 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -59,35 +59,25 @@ public class Autoscaler {
}
private Advice autoscale(Application application, Cluster cluster, NodeList clusterNodes, Limits limits) {
- if ( ! stable(clusterNodes, nodeRepository))
- return Advice.none("Cluster change in progress");
+ ClusterModel clusterModel = new ClusterModel(application, cluster, clusterNodes, metricsDb, nodeRepository.clock());
- Duration scalingWindow = cluster.scalingDuration(clusterNodes.clusterSpec());
- if (scaledIn(scalingWindow, cluster))
- return Advice.dontScale("Won't autoscale now: Less than " + scalingWindow + " since last resource change");
+ if ( ! clusterIsStable(clusterNodes, nodeRepository))
+ return Advice.none("Cluster change in progress");
- var clusterNodesTimeseries = new ClusterNodesTimeseries(scalingWindow, cluster, clusterNodes, metricsDb);
- var currentAllocation = new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive());
+ if (scaledIn(clusterModel.scalingDuration(), cluster))
+ return Advice.dontScale("Won't autoscale now: Less than " + clusterModel.scalingDuration() + " since last resource change");
- int measurementsPerNode = clusterNodesTimeseries.measurementsPerNode();
- if (measurementsPerNode < minimumMeasurementsPerNode(scalingWindow))
+ if (clusterModel.nodeTimeseries().measurementsPerNode() < minimumMeasurementsPerNode(clusterModel.scalingDuration()))
return Advice.none("Collecting more data before making new scaling decisions: Need to measure for " +
- scalingWindow + " since the last resource change completed");
+ clusterModel.scalingDuration() + " since the last resource change completed");
- int nodesMeasured = clusterNodesTimeseries.nodesMeasured();
- if (nodesMeasured != clusterNodes.size())
+ if (clusterModel.nodeTimeseries().nodesMeasured() != clusterNodes.size())
return Advice.none("Collecting more data before making new scaling decisions: " +
- "Have measurements from " + nodesMeasured + " nodes, but require from " + clusterNodes.size());
+ "Have measurements from " + clusterModel.nodeTimeseries().nodesMeasured() +
+ " nodes, but require from " + clusterNodes.size());
-
- var scalingDuration = cluster.scalingDuration(clusterNodes.clusterSpec());
- var clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id());
- var target = ResourceTarget.idealLoad(scalingDuration,
- clusterTimeseries,
- clusterNodesTimeseries,
- currentAllocation,
- application,
- nodeRepository.clock());
+ var currentAllocation = new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive());
+ var target = ResourceTarget.idealLoad(clusterModel, currentAllocation);
Optional<AllocatableClusterResources> bestAllocation =
allocationOptimizer.findBestAllocation(target, currentAllocation, limits);
@@ -97,13 +87,27 @@ public class Autoscaler {
if (similar(bestAllocation.get().realResources(), currentAllocation.realResources()))
return Advice.dontScale("Cluster is ideally scaled within configured limits");
- if (isDownscaling(bestAllocation.get(), currentAllocation) && scaledIn(scalingWindow.multipliedBy(3), cluster))
- return Advice.dontScale("Waiting " + scalingWindow.multipliedBy(3) +
+ if (isDownscaling(bestAllocation.get(), currentAllocation) && scaledIn(clusterModel.scalingDuration().multipliedBy(3), cluster))
+ return Advice.dontScale("Waiting " + clusterModel.scalingDuration().multipliedBy(3) +
" since the last change before reducing resources");
return Advice.scaleTo(bestAllocation.get().advertisedResources());
}
+    /**
+     * Returns whether the given cluster is in a settled state, i.e. whether none of its nodes
+     * are being retired or removed and its owner has no reserved nodes.
+     *
+     * @param clusterNodes the nodes of the cluster to check; the owner is read from the first node
+     * @param nodeRepository the repository in which to look for reserved nodes of the same owner
+     * @return false if a change to the cluster appears to be underway, true otherwise
+     */
+    public static boolean clusterIsStable(NodeList clusterNodes, NodeRepository nodeRepository) {
+        // Nodes which are (about to be) retired or removable means the cluster is still processing recent changes
+        boolean processingChanges =
+                clusterNodes.stream().anyMatch(node -> node.status().wantToRetire() ||
+                                                       node.allocation().get().membership().retired() ||
+                                                       node.allocation().get().isRemovable());
+        if (processingChanges) return false;
+
+        // Reserved nodes belonging to the same owner means a deployment is ongoing
+        boolean deploymentOngoing =
+                nodeRepository.nodes().list(Node.State.reserved)
+                              .owner(clusterNodes.first().get().allocation().get().owner())
+                              .size() > 0;
+        return ! deploymentOngoing;
+    }
+
/** Returns true if both total real resources and total cost are similar */
public static boolean similar(ClusterResources a, ClusterResources b) {
return similar(a.cost(), b.cost(), costDifferenceWorthReallocation) &&
@@ -143,20 +147,6 @@ public class Autoscaler {
return (int)minimumMeasurements;
}
- public static boolean stable(NodeList nodes, NodeRepository nodeRepository) {
- // The cluster is processing recent changes
- if (nodes.stream().anyMatch(node -> node.status().wantToRetire() ||
- node.allocation().get().membership().retired() ||
- node.allocation().get().isRemovable()))
- return false;
-
- // A deployment is ongoing
- if (nodeRepository.nodes().list(Node.State.reserved).owner(nodes.first().get().allocation().get().owner()).size() > 0)
- return false;
-
- return true;
- }
-
public static class Advice {
private final boolean present;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
new file mode 100644
index 00000000000..4fb91e8592e
--- /dev/null
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -0,0 +1,173 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.provision.autoscale;
+
+import com.yahoo.vespa.hosted.provision.NodeList;
+import com.yahoo.vespa.hosted.provision.applications.Application;
+import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
+
+import java.time.Clock;
+import java.time.Duration;
+import java.util.OptionalDouble;
+
+/**
+ * Couples a cluster with the metrics recorded for it, making it possible to predict
+ * how the cluster will behave going forward.
+ * Intended to be used by a single thread, and only for a short time.
+ *
+ * @author bratseth
+ */
+public class ClusterModel {
+
+    static final double idealQueryCpuLoad = 0.8;
+    static final double idealWriteCpuLoad = 0.95;
+    static final double idealMemoryLoad = 0.7;
+    static final double idealDiskLoad = 0.6;
+
+    private final Application application;
+    private final Cluster cluster;
+    private final NodeList nodes;
+    private final MetricsDb metricsDb;
+    private final Clock clock;
+    private final Duration scalingDuration;
+
+    // Computed on first access and reused afterwards
+    private Double queryFractionOfMax = null;
+    private Double maxQueryGrowthRate = null;
+    private ClusterNodesTimeseries nodeTimeseries = null;
+    private ClusterTimeseries clusterTimeseries = null;
+
+    /** Creates a model where the scaling duration is derived from the scaling events of the cluster */
+    public ClusterModel(Application application,
+                        Cluster cluster,
+                        NodeList clusterNodes,
+                        MetricsDb metricsDb,
+                        Clock clock) {
+        this.application = application;
+        this.cluster = cluster;
+        this.nodes = clusterNodes;
+        this.metricsDb = metricsDb;
+        this.clock = clock;
+        this.scalingDuration = computeScalingDuration(cluster, clusterNodes);
+    }
+
+    /**
+     * For testing: The scaling duration and cluster timeseries are supplied directly.
+     * The nodes and metrics db are left as null in instances created by this.
+     */
+    ClusterModel(Application application,
+                 Cluster cluster,
+                 Clock clock,
+                 Duration scalingDuration,
+                 ClusterTimeseries clusterTimeseries) {
+        this.application = application;
+        this.cluster = cluster;
+        this.nodes = null;
+        this.metricsDb = null;
+        this.clock = clock;
+
+        this.scalingDuration = scalingDuration;
+        this.clusterTimeseries = clusterTimeseries;
+    }
+
+    /** Returns how long a rescaling of this cluster is predicted to take */
+    public Duration scalingDuration() { return scalingDuration; }
+
+    /** Returns the per-node timeseries over the scaling duration, created on the first call */
+    public ClusterNodesTimeseries nodeTimeseries() {
+        if (nodeTimeseries == null)
+            nodeTimeseries = new ClusterNodesTimeseries(scalingDuration(), cluster, nodes, metricsDb);
+        return nodeTimeseries;
+    }
+
+    /** Returns the cluster-level timeseries, fetched from the metrics db on the first call unless already set */
+    public ClusterTimeseries clusterTimeseries() {
+        if (clusterTimeseries == null)
+            clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id());
+        return clusterTimeseries;
+    }
+
+    /**
+     * Returns the max query growth rate per minute we predict, expressed as a fraction of
+     * the average traffic in the scaling window
+     */
+    public double maxQueryGrowthRate() {
+        if (maxQueryGrowthRate == null)
+            maxQueryGrowthRate = clusterTimeseries().maxQueryGrowthRate(scalingDuration(), clock);
+        return maxQueryGrowthRate;
+    }
+
+    /** Returns the average query rate in the scaling window, expressed as a fraction of the max observed query rate */
+    public double queryFractionOfMax() {
+        if (queryFractionOfMax == null)
+            queryFractionOfMax = clusterTimeseries().queryFractionOfMax(scalingDuration(), clock);
+        return queryFractionOfMax;
+    }
+
+    /** Returns the average load of the given resource, as reported by the node timeseries */
+    public double averageLoad(Resource resource) { return nodeTimeseries().averageLoad(resource); }
+
+    /**
+     * Returns the load we ideally want to have on the given resource
+     *
+     * @throws IllegalStateException if no ideal load is defined for the resource
+     */
+    public double idealLoad(Resource resource) {
+        switch (resource) {
+            case cpu:
+                return idealCpuLoad();
+            case memory:
+                return idealMemoryLoad;
+            case disk:
+                return idealDiskLoad;
+            default:
+                throw new IllegalStateException("No ideal load defined for " + resource);
+        }
+    }
+
+    /** The ideal cpu load depends on the traffic the application receives, so it must be computed */
+    private double idealCpuLoad() {
+        double queryCpuFraction = queryCpuFraction();
+
+        // Headroom, as a fraction of current resources, needed to absorb growth while a scale-up takes place
+        double growthRateHeadroom = 1 + maxQueryGrowthRate() * scalingDuration().toMinutes();
+        // ... but never more than 10% above the historical observed peak
+        if (queryFractionOfMax() != 0)
+            growthRateHeadroom = Math.min(growthRateHeadroom, 1 / queryFractionOfMax() + 0.1);
+
+        double trafficShiftHeadroom = trafficShiftHeadroom();
+
+        // Assumptions: 1) Write load is not organic so we should not grow to handle more.
+        //                 (TODO: But allow applications to set their target write rate and size for that)
+        //              2) Write load does not change in BCP scenarios.
+        double queryPart = queryCpuFraction / growthRateHeadroom / trafficShiftHeadroom * idealQueryCpuLoad;
+        double writePart = (1 - queryCpuFraction) * idealWriteCpuLoad;
+        return queryPart + writePart;
+    }
+
+    /** Returns the headroom needed to handle traffic suddenly arriving because another zone goes down */
+    private double trafficShiftHeadroom() {
+        double maxTrafficShiftHeadroom = 10.0; // Cap to avoid extreme sizes from a current very small share
+
+        if (application.status().maxReadShare() == 0) // No traffic fraction data
+            return 2.0; // assume we currently get half of the global share of traffic
+        if (application.status().currentReadShare() == 0)
+            return maxTrafficShiftHeadroom;
+        return Math.min(application.status().maxReadShare() / application.status().currentReadShare(),
+                        maxTrafficShiftHeadroom);
+    }
+
+    /** Returns the fraction of cpu spent on queries, given the query and write rates in the scaling window */
+    private double queryCpuFraction() {
+        OptionalDouble queryRate = clusterTimeseries().queryRate(scalingDuration(), clock);
+        OptionalDouble writeRate = clusterTimeseries().writeRate(scalingDuration(), clock);
+        double queries = queryRate.orElse(0);
+        double writes = writeRate.orElse(0);
+
+        // With no traffic at all, assume an even split between queries and writes
+        if (queries == 0 && writes == 0) return queryCpuFraction(0.5);
+        return queryCpuFraction(queries / (queries + writes));
+    }
+
+    /** Converts the fraction of operations that are queries into the fraction of cpu spent on queries */
+    private double queryCpuFraction(double queryFraction) {
+        double relativeQueryCost = 9; // How much more expensive are queries than writes? TODO: Measure
+        double weightedQueries = queryFraction * relativeQueryCost;
+        return weightedQueries / (weightedQueries + (1 - queryFraction));
+    }
+
+    /**
+     * Predicts the scaling duration as the average duration of the completed scaling events
+     * of the cluster, subject to minimums, or from defaults if no event has completed.
+     */
+    private static Duration computeScalingDuration(Cluster cluster, NodeList nodes) {
+        int completedEventCount = 0;
+        Duration totalDuration = Duration.ZERO;
+        for (ScalingEvent event : cluster.scalingEvents()) {
+            if (event.duration().isPresent()) { // Events without a duration have not completed
+                completedEventCount++;
+                totalDuration = totalDuration.plus(event.duration().get());
+            }
+        }
+
+        boolean stateful = nodes.clusterSpec().isStateful();
+
+        if (completedEventCount == 0) // Use defaults
+            return stateful ? Duration.ofHours(12) : Duration.ofMinutes(10);
+
+        Duration predictedDuration = totalDuration.dividedBy(completedEventCount);
+
+        // TODO: Remove when we have reliable completion for content clusters
+        if (stateful && predictedDuration.minus(Duration.ofHours(12)).isNegative())
+            return Duration.ofHours(12);
+
+        Duration minimum = Duration.ofMinutes(5);
+        if (predictedDuration.minus(minimum).isNegative()) return minimum;
+        return predictedDuration;
+    }
+
+}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
index 2d0e77742ec..c097abd8208 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
@@ -40,9 +40,7 @@ public class ClusterNodesTimeseries {
}
/** Returns the number of nodes measured in this */
- public int nodesMeasured() {
- return timeseries.size();
- }
+ public int nodesMeasured() { return timeseries.size(); }
/** Returns the average load of this resource in this */
public double averageLoad(Resource resource) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
index e12e5442c52..79e2d3b3398 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
@@ -22,8 +22,6 @@ public class ClusterTimeseries {
private final ClusterSpec.Id cluster;
private final List<ClusterMetricSnapshot> snapshots;
- private Double cachedMaxQueryGrowthRate = null;
-
ClusterTimeseries(ClusterSpec.Id cluster, List<ClusterMetricSnapshot> snapshots) {
this.cluster = cluster;
List<ClusterMetricSnapshot> sortedSnapshots = new ArrayList<>(snapshots);
@@ -51,12 +49,6 @@ public class ClusterTimeseries {
* The max query growth rate we can predict from this time-series as a fraction of the average traffic in the window
*/
public double maxQueryGrowthRate(Duration window, Clock clock) {
- if (cachedMaxQueryGrowthRate != null)
- return cachedMaxQueryGrowthRate;
- return cachedMaxQueryGrowthRate = computeMaxQueryGrowthRate(window, clock);
- }
-
- private double computeMaxQueryGrowthRate(Duration window, Clock clock) {
if (snapshots.isEmpty()) return 0.1;
// Find the period having the highest growth rate, where total growth exceeds 30% increase
double maxGrowthRate = 0; // In query rate per minute
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
index c8de94eeab8..b3cf6c1e962 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
@@ -103,7 +103,7 @@ public class MetricsResponse {
private boolean clusterIsStable(Node node, NodeList applicationNodes, NodeRepository nodeRepository) {
ClusterSpec cluster = node.allocation().get().membership().cluster();
- return Autoscaler.stable(applicationNodes.cluster(cluster.id()), nodeRepository);
+ return Autoscaler.clusterIsStable(applicationNodes.cluster(cluster.id()), nodeRepository);
}
public static MetricsResponse empty() { return new MetricsResponse(List.of()); }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Resource.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Resource.java
index b841b31833f..c639ad1f779 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Resource.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Resource.java
@@ -12,25 +12,19 @@ public enum Resource {
/** Cpu utilization ratio */
cpu {
- public double idealAverageLoad() { return 0.8; }
double valueFrom(NodeResources resources) { return resources.vcpu(); }
},
/** Memory utilization ratio */
memory {
- public double idealAverageLoad() { return 0.7; }
double valueFrom(NodeResources resources) { return resources.memoryGb(); }
},
/** Disk utilization ratio */
disk {
- public double idealAverageLoad() { return 0.6; }
double valueFrom(NodeResources resources) { return resources.diskGb(); }
};
- /** The load we should have of this resource on average, when one node in the cluster is down */
- public abstract double idealAverageLoad();
-
abstract double valueFrom(NodeResources resources);
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java
index 9f6a4fc77cd..b1a1e86b08d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java
@@ -51,18 +51,14 @@ public class ResourceTarget {
}
/** Create a target of achieving ideal load given a current load */
- public static ResourceTarget idealLoad(Duration scalingDuration,
- ClusterTimeseries clusterTimeseries,
- ClusterNodesTimeseries clusterNodesTimeseries,
- AllocatableClusterResources current,
- Application application,
- Clock clock) {
- return new ResourceTarget(nodeUsage(Resource.cpu, clusterNodesTimeseries.averageLoad(Resource.cpu), current)
- / idealCpuLoad(scalingDuration, clusterTimeseries, application, clock),
- nodeUsage(Resource.memory, clusterNodesTimeseries.averageLoad(Resource.memory), current)
- / Resource.memory.idealAverageLoad(),
- nodeUsage(Resource.disk, clusterNodesTimeseries.averageLoad(Resource.disk), current)
- / Resource.disk.idealAverageLoad(),
+ public static ResourceTarget idealLoad(ClusterModel clusterModel,
+ AllocatableClusterResources current) {
+ return new ResourceTarget(nodeUsage(Resource.cpu, clusterModel.averageLoad(Resource.cpu), current)
+ / clusterModel.idealLoad(Resource.cpu),
+ nodeUsage(Resource.memory, clusterModel.averageLoad(Resource.memory), current)
+ / clusterModel.idealLoad(Resource.memory),
+ nodeUsage(Resource.disk, clusterModel.averageLoad(Resource.disk), current)
+ / clusterModel.idealLoad(Resource.disk),
true);
}
@@ -74,58 +70,4 @@ public class ResourceTarget {
false);
}
- /** Ideal cpu load must take the application traffic fraction into account */
- public static double idealCpuLoad(Duration scalingDuration,
- ClusterTimeseries clusterTimeseries,
- Application application,
- Clock clock) {
- double queryCpuFraction = queryCpuFraction(clusterTimeseries, scalingDuration, clock);
-
- // What's needed to have headroom for growth during scale-up as a fraction of current resources?
- double maxGrowthRate = clusterTimeseries.maxQueryGrowthRate(scalingDuration, clock); // in fraction per minute of the current traffic
- double growthRateHeadroom = 1 + maxGrowthRate * scalingDuration.toMinutes();
- // Cap headroom at 10% above the historical observed peak
- double fractionOfMax = clusterTimeseries.queryFractionOfMax(scalingDuration, clock);
- if (fractionOfMax != 0)
- growthRateHeadroom = Math.min(growthRateHeadroom, 1 / fractionOfMax + 0.1);
-
- // How much headroom is needed to handle sudden arrival of additional traffic due to another zone going down?
- double maxTrafficShiftHeadroom = 10.0; // Cap to avoid extreme sizes from a current very small share
- double trafficShiftHeadroom;
- if (application.status().maxReadShare() == 0) // No traffic fraction data
- trafficShiftHeadroom = 2.0; // assume we currently get half of the global share of traffic
- else if (application.status().currentReadShare() == 0)
- trafficShiftHeadroom = maxTrafficShiftHeadroom;
- else
- trafficShiftHeadroom = application.status().maxReadShare() / application.status().currentReadShare();
- trafficShiftHeadroom = Math.min(trafficShiftHeadroom, maxTrafficShiftHeadroom);
-
- // Assumptions: 1) Write load is not organic so we should not grow to handle more.
- // (TODO: But allow applications to set their target write rate and size for that)
- // 2) Write load does not change in BCP scenarios.
- return queryCpuFraction * 1 / growthRateHeadroom * 1 / trafficShiftHeadroom * idealQueryCpuLoad() +
- (1 - queryCpuFraction) * idealWriteCpuLoad();
- }
-
- private static double queryCpuFraction(ClusterTimeseries clusterTimeseries, Duration scalingDuration, Clock clock) {
- OptionalDouble queryRate = clusterTimeseries.queryRate(scalingDuration, clock);
- OptionalDouble writeRate = clusterTimeseries.writeRate(scalingDuration, clock);
- if (queryRate.orElse(0) == 0 && writeRate.orElse(0) == 0) return queryCpuFraction(0.5);
- return queryCpuFraction(queryRate.orElse(0) / (queryRate.orElse(0) + writeRate.orElse(0)));
- }
-
- private static double queryCpuFraction(double queryFraction) {
- double relativeQueryCost = 9; // How much more expensive are queries than writes? TODO: Measure
- double writeFraction = 1 - queryFraction;
- return queryFraction * relativeQueryCost / (queryFraction * relativeQueryCost + writeFraction);
- }
-
- public static double idealQueryCpuLoad() { return Resource.cpu.idealAverageLoad(); }
-
- public static double idealWriteCpuLoad() { return 0.95; }
-
- public static double idealMemoryLoad() { return Resource.memory.idealAverageLoad(); }
-
- public static double idealDiskLoad() { return Resource.disk.idealAverageLoad(); }
-
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java
index 8d8d7e01049..cc59860384b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java
@@ -7,9 +7,11 @@ import com.yahoo.slime.Cursor;
import com.yahoo.slime.Slime;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
+import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
+import com.yahoo.vespa.hosted.provision.autoscale.ClusterModel;
import com.yahoo.vespa.hosted.provision.autoscale.ClusterNodesTimeseries;
import com.yahoo.vespa.hosted.provision.autoscale.ClusterTimeseries;
import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb;
@@ -29,40 +31,45 @@ import java.util.List;
*/
public class ApplicationSerializer {
- public static Slime toSlime(Application application, NodeList applicationNodes, MetricsDb metricsDb, URI applicationUri) {
+ public static Slime toSlime(Application application,
+ NodeList applicationNodes,
+ MetricsDb metricsDb,
+ NodeRepository nodeRepository,
+ URI applicationUri) {
Slime slime = new Slime();
- toSlime(application, applicationNodes, metricsDb, slime.setObject(), applicationUri);
+ toSlime(application, applicationNodes, metricsDb, nodeRepository, slime.setObject(), applicationUri);
return slime;
}
private static void toSlime(Application application,
NodeList applicationNodes,
MetricsDb metricsDb,
+ NodeRepository nodeRepository,
Cursor object,
URI applicationUri) {
object.setString("url", applicationUri.toString());
object.setString("id", application.id().toFullString());
- clustersToSlime(application, applicationNodes, metricsDb, object.setObject("clusters"));
+ clustersToSlime(application, applicationNodes, metricsDb, nodeRepository, object.setObject("clusters"));
}
private static void clustersToSlime(Application application,
NodeList applicationNodes,
MetricsDb metricsDb,
+ NodeRepository nodeRepository,
Cursor clustersObject) {
- application.clusters().values().forEach(cluster -> toSlime(application, cluster, applicationNodes, metricsDb, clustersObject));
+ application.clusters().values().forEach(cluster -> toSlime(application, cluster, applicationNodes, metricsDb, nodeRepository, clustersObject));
}
private static void toSlime(Application application,
Cluster cluster,
NodeList applicationNodes,
MetricsDb metricsDb,
+ NodeRepository nodeRepository,
Cursor clustersObject) {
NodeList nodes = applicationNodes.not().retired().cluster(cluster.id());
if (nodes.isEmpty()) return;
ClusterResources currentResources = nodes.toResources();
- Duration scalingDuration = cluster.scalingDuration(nodes.clusterSpec());
- var clusterNodesTimeseries = new ClusterNodesTimeseries(Duration.ofHours(1), cluster, nodes, metricsDb);
- var clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id());
+ ClusterModel clusterModel = new ClusterModel(application, cluster, nodes, metricsDb, nodeRepository.clock());
Cursor clusterObject = clustersObject.setObject(cluster.id().value());
clusterObject.setString("type", nodes.clusterSpec().type().name());
@@ -72,12 +79,12 @@ public class ApplicationSerializer {
if (cluster.shouldSuggestResources(currentResources))
cluster.suggestedResources().ifPresent(suggested -> toSlime(suggested.resources(), clusterObject.setObject("suggested")));
cluster.targetResources().ifPresent(target -> toSlime(target, clusterObject.setObject("target")));
- clusterUtilizationToSlime(application, scalingDuration, clusterTimeseries, clusterNodesTimeseries, metricsDb.clock(), clusterObject.setObject("utilization"));
+ clusterUtilizationToSlime(clusterModel, clusterObject.setObject("utilization"));
scalingEventsToSlime(cluster.scalingEvents(), clusterObject.setArray("scalingEvents"));
clusterObject.setString("autoscalingStatus", cluster.autoscalingStatus());
- clusterObject.setLong("scalingDuration", scalingDuration.toMillis());
- clusterObject.setDouble("maxQueryGrowthRate", clusterTimeseries.maxQueryGrowthRate(scalingDuration, metricsDb.clock()));
- clusterObject.setDouble("currentQueryFractionOfMax", clusterTimeseries.queryFractionOfMax(scalingDuration, metricsDb.clock()));
+ clusterObject.setLong("scalingDuration", clusterModel.scalingDuration().toMillis());
+ clusterObject.setDouble("maxQueryGrowthRate", clusterModel.maxQueryGrowthRate());
+ clusterObject.setDouble("currentQueryFractionOfMax", clusterModel.queryFractionOfMax());
}
private static void toSlime(ClusterResources resources, Cursor clusterResourcesObject) {
@@ -86,18 +93,13 @@ public class ApplicationSerializer {
NodeResourcesSerializer.toSlime(resources.nodeResources(), clusterResourcesObject.setObject("resources"));
}
- private static void clusterUtilizationToSlime(Application application,
- Duration scalingDuration,
- ClusterTimeseries clusterTimeseries,
- ClusterNodesTimeseries clusterNodesTimeseries,
- Clock clock,
- Cursor utilizationObject) {
- utilizationObject.setDouble("cpu", clusterNodesTimeseries.averageLoad(Resource.cpu));
- utilizationObject.setDouble("idealCpu", ResourceTarget.idealCpuLoad(scalingDuration, clusterTimeseries, application, clock));
- utilizationObject.setDouble("memory", clusterNodesTimeseries.averageLoad(Resource.memory));
- utilizationObject.setDouble("idealMemory", ResourceTarget.idealMemoryLoad());
- utilizationObject.setDouble("disk", clusterNodesTimeseries.averageLoad(Resource.disk));
- utilizationObject.setDouble("idealDisk", ResourceTarget.idealDiskLoad());
+ private static void clusterUtilizationToSlime(ClusterModel clusterModel, Cursor utilizationObject) {
+ utilizationObject.setDouble("cpu", clusterModel.averageLoad(Resource.cpu));
+ utilizationObject.setDouble("idealCpu", clusterModel.idealLoad(Resource.cpu));
+ utilizationObject.setDouble("memory", clusterModel.averageLoad(Resource.memory));
+ utilizationObject.setDouble("idealMemory", clusterModel.idealLoad(Resource.memory));
+ utilizationObject.setDouble("disk", clusterModel.averageLoad(Resource.disk));
+ utilizationObject.setDouble("idealDisk", clusterModel.idealLoad(Resource.disk));
}
private static void scalingEventsToSlime(List<ScalingEvent> scalingEvents, Cursor scalingEventsArray) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
index 2442ff9d565..52081877d98 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
@@ -447,6 +447,7 @@ public class NodesV2ApiHandler extends LoggingRequestHandler {
Slime slime = ApplicationSerializer.toSlime(application.get(),
nodeRepository.nodes().list(Node.State.active).owner(id),
metricsDb,
+ nodeRepository,
withPath("/nodes/v2/applications/" + id, uri));
return new SlimeJsonResponse(slime);
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
index cc3eeb47073..89da20c5550 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
@@ -141,8 +141,8 @@ class AutoscalingTester {
clock().advance(Duration.ofMinutes(5));
for (Node node : nodes) {
float cpu = value * oneExtraNodeFactor;
- float memory = (float) Resource.memory.idealAverageLoad() * otherResourcesLoad * oneExtraNodeFactor;
- float disk = (float) Resource.disk.idealAverageLoad() * otherResourcesLoad * oneExtraNodeFactor;
+ float memory = (float) ClusterModel.idealMemoryLoad * otherResourcesLoad * oneExtraNodeFactor;
+ float disk = (float) ClusterModel.idealDiskLoad * otherResourcesLoad * oneExtraNodeFactor;
db.addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(clock().instant(),
cpu,
memory,
@@ -174,7 +174,7 @@ class AutoscalingTester {
for (Node node : nodes) {
float cpu = (float) 0.2 * otherResourcesLoad * oneExtraNodeFactor;
float memory = value * oneExtraNodeFactor;
- float disk = (float) Resource.disk.idealAverageLoad() * otherResourcesLoad * oneExtraNodeFactor;
+ float disk = (float) ClusterModel.idealDiskLoad * otherResourcesLoad * oneExtraNodeFactor;
db.addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(clock().instant(),
cpu,
memory,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
index 6f60de62f1f..70550b0a7c3 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
@@ -22,7 +22,7 @@ import static org.junit.Assert.assertEquals;
/**
* @author bratseth
*/
-public class ResourceTargetTest {
+public class ClusterModelTest {
private static final double delta = 0.001;
@@ -34,22 +34,16 @@ public class ResourceTargetTest {
application = application.with(cluster);
// No current traffic share: Ideal load is low but capped
- application = application.with(new Status(0.0, 1.0));
- assertEquals(0.131,
- ResourceTarget.idealCpuLoad(Duration.ofMinutes(10),
- timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
- application,
- clock),
- delta);
+ var model1 = new ClusterModel(application.with(new Status(0.0, 1.0)),
+ cluster, clock, Duration.ofMinutes(10),
+ timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock));
+ assertEquals(0.131, model1.idealLoad(Resource.cpu), delta);
// Almost no current traffic share: Ideal load is low but capped
- application = application.with(new Status(0.0001, 1.0));
- assertEquals(0.131,
- ResourceTarget.idealCpuLoad(Duration.ofMinutes(10),
- timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
- application,
- clock),
- delta);
+ var model2 = new ClusterModel(application.with(new Status(0.0001, 1.0)),
+ cluster, clock, Duration.ofMinutes(10),
+ timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock));
+ assertEquals(0.131, model2.idealLoad(Resource.cpu), delta);
}
@Test
@@ -61,21 +55,16 @@ public class ResourceTargetTest {
application = application.with(cluster);
// No current traffic: Ideal load is low but capped
- assertEquals(0.275,
- ResourceTarget.idealCpuLoad(Duration.ofMinutes(10),
- timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
- application,
- clock),
- delta);
+ var model1 = new ClusterModel(application,
+ cluster, clock, Duration.ofMinutes(10),
+ timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock));
+ assertEquals(0.275, model1.idealLoad(Resource.cpu), delta);
// Almost current traffic: Ideal load is low but capped
- application = application.with(new Status(0.0001, 1.0));
- assertEquals(0.04,
- ResourceTarget.idealCpuLoad(Duration.ofMinutes(10),
- timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0, clock),
- application,
- clock),
- delta);
+ var model2 = new ClusterModel(application.with(new Status(0.0001, 1.0)),
+ cluster, clock, Duration.ofMinutes(10),
+ timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0, clock));
+ assertEquals(0.04, model2.idealLoad(Resource.cpu), delta); // check model2 itself; 0.04 is the value the replaced assertion expected for these inputs
}
private Cluster cluster(NodeResources resources) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
index 9ae67cef235..10851252c98 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
@@ -17,6 +17,7 @@ import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import com.yahoo.vespa.hosted.provision.autoscale.ClusterModel;
import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricSnapshot;
import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb;
import com.yahoo.vespa.hosted.provision.autoscale.Resource;
@@ -99,9 +100,7 @@ public class ScalingSuggestionsMaintainerTest {
var suggested = tester.nodeRepository().applications().get(app1).get().cluster(cluster1.id()).get().suggestedResources().get().resources();
tester.deploy(app1, cluster1, Capacity.from(suggested, suggested, false, true));
tester.clock().advance(Duration.ofDays(2));
- addMeasurements(0.2f,
- (float)Resource.memory.idealAverageLoad(),
- (float)Resource.disk.idealAverageLoad(),
+ addMeasurements(0.2f, 0.7f, 0.6f,
0, 500, app1, tester.nodeRepository(), metricsDb);
maintainer.maintain();
assertEquals("Suggestion is to keep the current allocation",