diff options
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java')
-rw-r--r-- | node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java | 87 |
1 files changed, 85 insertions, 2 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 5b1ee6cc496..ae18e7ffb91 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -33,6 +33,11 @@ public class ClusterModel { static final double idealContainerDiskLoad = 0.95; static final double idealContentDiskLoad = 0.6; + // When a query is issued on a node the cost is the sum of a fixed cost component and a cost component + // proportional to document count. We must account for this when comparing configurations with more or fewer nodes. + // TODO: Measure this, and only take it into account with queries + private static final double fixedCpuCostFraction = 0.1; + private final Application application; private final ClusterSpec clusterSpec; private final Cluster cluster; @@ -74,7 +79,7 @@ public class ClusterModel { this.application = application; this.clusterSpec = clusterSpec; this.cluster = cluster; - this.nodes = null; + this.nodes = NodeList.of(); this.clock = clock; this.scalingDuration = scalingDuration; @@ -86,6 +91,20 @@ public class ClusterModel { public ClusterSpec clusterSpec() { return clusterSpec; } public Cluster cluster() { return cluster; } + /** Returns the relative load adjustment that should be made to this cluster given available measurements. */ + public Load loadAdjustment() { + if (nodeTimeseries().measurementsPerNode() == 0) return Load.one(); // No info, no change + /* + // Should we scale up? + Load relativePeak = nodeTimeseries().peakLoad().divide(idealLoad()); + if (relativePeak.any(v -> v > 1)) + return relativePeak.max(Load.one()); // Don't downscale any dimension if we upscale + + // Should we scale down? 
+ */ + return averageLoad().divide(idealLoad()); + } + /** Returns the predicted duration of a rescaling of this cluster */ public Duration scalingDuration() { return scalingDuration; } @@ -114,8 +133,72 @@ public class ClusterModel { /** Returns average load during the last {@link #scalingDuration()} */ public Load averageLoad() { return nodeTimeseries().averageLoad(clock.instant().minus(scalingDuration())); } + /** The number of nodes this cluster has, or will have if not deployed yet. */ + // TODO: Make this the deployed, not current count + public int nodeCount() { + if ( ! nodes.isEmpty()) return (int)nodes.stream().count(); + return cluster.minResources().nodes(); + } + + /** The number of groups this cluster has, or will have if not deployed yet. */ + // TODO: Make this the deployed, not current count + public int groupCount() { + if ( ! nodes.isEmpty()) return (int)nodes.stream().mapToInt(node -> node.allocation().get().membership().cluster().group().get().index()).distinct().count(); + return cluster.minResources().groups(); + } + + public int groupSize() { + // ceil: If the division does not produce a whole number we assume some node is missing + return (int)Math.ceil((double)nodeCount() / groupCount()); + } + + /** Returns the relative load adjustment accounting for redundancy in this. */ + public Load redundancyAdjustment() { + return loadWith(nodeCount(), groupCount()); + } + + /** + * Returns the relative load adjustment accounting for redundancy given these nodes+groups + * relative to the nodes+groups in this. 
+ */ + public Load loadWith(int trueNodes, int trueGroups) { + int nodes = nodesAdjustedForRedundancy(trueNodes, trueGroups); + int groups = groupsAdjustedForRedundancy(trueNodes, trueGroups); + if (clusterSpec().type() == ClusterSpec.Type.content) { // load scales with node share of content + int groupSize = nodes / groups; + + // Cpu: Query cpu scales with cluster size, write cpu scales with group size + // Memory and disk: Scales with group size + + // The fixed cost portion of cpu does not scale with changes to the node count + double queryCpuPerGroup = fixedCpuCostFraction + (1 - fixedCpuCostFraction) * groupSize() / groupSize; + + double queryCpu = queryCpuPerGroup * groupCount() / groups; + double writeCpu = (double)groupSize() / groupSize; + return new Load(queryCpuFraction() * queryCpu + (1 - queryCpuFraction()) * writeCpu, + (double)groupSize() / groupSize, + (double)groupSize() / groupSize); + } + else { + return new Load((double)nodeCount() / nodes, 1, 1); + } + } + + /** + * Returns the ideal load across the nodes of this such that each node will be at ideal load + * if one of the nodes goes down. + */ public Load idealLoad() { - return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad()); + return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad()).divide(redundancyAdjustment()); + } + + public int nodesAdjustedForRedundancy(int nodes, int groups) { + int groupSize = (int)Math.ceil((double)nodes / groups); + return nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes; + } + + public int groupsAdjustedForRedundancy(int nodes, int groups) { + return nodes > 1 ? (groups == 1 ? 1 : groups - 1) : groups; } /** Ideal cpu load must take the application traffic fraction into account */ |