Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java')
-rw-r--r-- node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java | 87
1 file changed, 85 insertions(+), 2 deletions(-)
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
index 5b1ee6cc496..ae18e7ffb91 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -33,6 +33,11 @@ public class ClusterModel {
static final double idealContainerDiskLoad = 0.95;
static final double idealContentDiskLoad = 0.6;
+ // When a query is issued on a node, the cost is the sum of a fixed cost component and a cost component
+ // proportional to document count. We must account for this when comparing configurations with more or fewer nodes.
+ // TODO: Measure this, and only take it into account with queries
+ private static final double fixedCpuCostFraction = 0.1;
+
private final Application application;
private final ClusterSpec clusterSpec;
private final Cluster cluster;
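
The constant added above encodes a simple per-node query cost model. A minimal sketch of that model, using the same ratio the later loadWith() hunk applies (the method name and parameters here are illustrative, not part of this change): the fixed fraction of query cpu is paid regardless of how few documents a node holds, so growing a group only shrinks the remaining, document-proportional fraction.

    // Illustrative sketch: relative query cpu per node when a group grows from
    // currentGroupSize to newGroupSize nodes. Only the document-proportional part shrinks.
    static double relativeQueryCpuPerNode(double fixedCpuCostFraction, int currentGroupSize, int newGroupSize) {
        return fixedCpuCostFraction
               + (1 - fixedCpuCostFraction) * (double) currentGroupSize / newGroupSize;
    }

For example, relativeQueryCpuPerNode(0.1, 4, 8) is 0.1 + 0.9 * 0.5 = 0.55: doubling the group size roughly halves only the 90% of query cpu that is proportional to document count.
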
@@ -74,7 +79,7 @@ public class ClusterModel {
this.application = application;
this.clusterSpec = clusterSpec;
this.cluster = cluster;
- this.nodes = null;
+ this.nodes = NodeList.of();
this.clock = clock;
this.scalingDuration = scalingDuration;
@@ -86,6 +91,20 @@ public class ClusterModel {
public ClusterSpec clusterSpec() { return clusterSpec; }
public Cluster cluster() { return cluster; }
+ /** Returns the relative load adjustment that should be made to this cluster given available measurements. */
+ public Load loadAdjustment() {
+ if (nodeTimeseries().measurementsPerNode() == 0) return Load.one(); // No info, no change
+ /*
+ // Should we scale up?
+ Load relativePeak = nodeTimeseries().peakLoad().divide(idealLoad());
+ if (relativePeak.any(v -> v > 1))
+ return relativePeak.max(Load.one()); // Don't downscale any dimension if we upscale
+
+ // Should we scale down?
+ */
+ return averageLoad().divide(idealLoad());
+ }
+
/** Returns the predicted duration of a rescaling of this cluster */
public Duration scalingDuration() { return scalingDuration; }
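
loadAdjustment() above divides, compares and caps per-resource Load values. A rough sketch of such a value type, only to make those operations concrete; the field names and methods below are assumptions for illustration and may differ from the actual Load class in this package.

    // Assumed shape of a per-resource load value; one entry per resource dimension.
    record Load(double cpu, double memory, double disk) {
        static Load one() { return new Load(1, 1, 1); }
        Load divide(Load other) {
            return new Load(cpu / other.cpu, memory / other.memory, disk / other.disk);
        }
        Load max(Load other) {
            return new Load(Math.max(cpu, other.cpu), Math.max(memory, other.memory), Math.max(disk, other.disk));
        }
        boolean any(java.util.function.DoublePredicate p) {
            return p.test(cpu) || p.test(memory) || p.test(disk);
        }
    }

Read with that in mind, a loadAdjustment() of (1.3, 0.8, 0.9) would mean cpu runs 30% above its ideal while memory and disk have headroom, and a value of one in every dimension means no change is indicated.
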
@@ -114,8 +133,72 @@ public class ClusterModel {
/** Returns average load during the last {@link #scalingDuration()} */
public Load averageLoad() { return nodeTimeseries().averageLoad(clock.instant().minus(scalingDuration())); }
+ /** The number of nodes this cluster has, or will have if not deployed yet. */
+ // TODO: Make this the deployed, not current count
+ public int nodeCount() {
+ if ( ! nodes.isEmpty()) return (int)nodes.stream().count();
+ return cluster.minResources().nodes();
+ }
+
+ /** The number of groups this cluster has, or will have if not deployed yet. */
+ // TODO: Make this the deployed, not current count
+ public int groupCount() {
+ if ( ! nodes.isEmpty()) return (int)nodes.stream().mapToInt(node -> node.allocation().get().membership().cluster().group().get().index()).distinct().count();
+ return cluster.minResources().groups();
+ }
+
+ public int groupSize() {
+ // ceil: If the division does not produce a whole number we assume some node is missing
+ return (int)Math.ceil((double)nodeCount() / groupCount());
+ }
+
+ /** Returns the relative load adjustment accounting for redundancy in this. */
+ public Load redundancyAdjustment() {
+ return loadWith(nodeCount(), groupCount());
+ }
+
+ /**
+ * Returns the relative load adjustment accounting for redundancy given these nodes+groups
+ * relative to the current nodes+groups in this.
+ */
+ public Load loadWith(int trueNodes, int trueGroups) {
+ int nodes = nodesAdjustedForRedundancy(trueNodes, trueGroups);
+ int groups = groupsAdjustedForRedundancy(trueNodes, trueGroups);
+ if (clusterSpec().type() == ClusterSpec.Type.content) { // load scales with node share of content
+ int groupSize = nodes / groups;
+
+ // Cpu: Query cpu scales with cluster size, write cpu scales with group size
+ // Memory and disk: Scales with group size
+
+ // The fixed cost portion of cpu does not scale with changes to the node count
+ double queryCpuPerGroup = fixedCpuCostFraction + (1 - fixedCpuCostFraction) * groupSize() / groupSize;
+
+ double queryCpu = queryCpuPerGroup * groupCount() / groups;
+ double writeCpu = (double)groupSize() / groupSize;
+ return new Load(queryCpuFraction() * queryCpu + (1 - queryCpuFraction()) * writeCpu,
+ (double)groupSize() / groupSize,
+ (double)groupSize() / groupSize);
+ }
+ else {
+ return new Load((double)nodeCount() / nodes, 1, 1);
+ }
+ }
+
+ /**
+ * Returns the ideal load across the nodes of this such that each node will be at ideal load
+ * if one of the nodes goes down.
+ */
public Load idealLoad() {
- return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad());
+ return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad()).divide(redundancyAdjustment());
+ }
+
+ public int nodesAdjustedForRedundancy(int nodes, int groups) {
+ int groupSize = (int)Math.ceil((double)nodes / groups);
+ return nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes;
+ }
+
+ public int groupsAdjustedForRedundancy(int nodes, int groups) {
+ return nodes > 1 ? (groups == 1 ? 1 : groups - 1) : groups;
}
/** Ideal cpu load must take the application traffic fraction into account */
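
Since idealLoad() now divides by redundancyAdjustment(), a worked example of the two adjustment methods added above helps; the snippet below just restates them outside the class so the numbers can be checked by hand (it is not the production code path):

    // Restatement of the redundancy adjustment, for a hand-checked example.
    static int nodesAdjustedForRedundancy(int nodes, int groups) {
        int groupSize = (int) Math.ceil((double) nodes / groups);
        return nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes;
    }
    static int groupsAdjustedForRedundancy(int nodes, int groups) {
        return nodes > 1 ? (groups == 1 ? 1 : groups - 1) : groups;
    }
    // nodesAdjustedForRedundancy(6, 1) == 5, groupsAdjustedForRedundancy(6, 1) == 1: size for one node down
    // nodesAdjustedForRedundancy(8, 2) == 4, groupsAdjustedForRedundancy(8, 2) == 1: size for one whole group down

redundancyAdjustment() is loadWith(nodeCount(), groupCount()), and loadWith() applies these reductions internally, so the resulting per-node load factor is above one; dividing the ideal load by it lowers each node's target just enough that the remaining nodes sit at ideal load after losing one node (single group) or one whole group (multiple groups).
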