diff options
Diffstat (limited to 'node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale')
7 files changed, 220 insertions, 72 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java index 88a4b492a0b..4a1545cc66c 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java @@ -64,6 +64,33 @@ public class AllocatableClusterResources { this.fulfilment = fulfilment(realResources, idealResources); } + private AllocatableClusterResources(int nodes, + int groups, + NodeResources realResources, + NodeResources advertisedResources, + ClusterSpec clusterSpec, + double fulfilment) { + this.nodes = nodes; + this.groups = groups; + this.realResources = realResources; + this.advertisedResources = advertisedResources; + this.clusterSpec = clusterSpec; + this.fulfilment = fulfilment; + } + + /** Returns this with the redundant node or group removed from counts. */ + public AllocatableClusterResources withoutRedundancy() { + int groupSize = nodes / groups; + int nodesAdjustedForRedundancy = nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes; + int groupsAdjustedForRedundancy = nodes > 1 ? (groups == 1 ? 1 : groups - 1) : groups; + return new AllocatableClusterResources(nodesAdjustedForRedundancy, + groupsAdjustedForRedundancy, + realResources, + advertisedResources, + clusterSpec, + fulfilment); + } + /** * Returns the resources which will actually be available per node in this cluster with this allocation. * These should be used for reasoning about allocation to meet measured demand. 
@@ -83,11 +110,6 @@ public class AllocatableClusterResources { public int nodes() { return nodes; } public int groups() { return groups; } - public int groupSize() { - // ceil: If the division does not produce a whole number we assume some node is missing - return (int)Math.ceil((double)nodes / groups); - } - public ClusterSpec clusterSpec() { return clusterSpec; } public double cost() { return nodes * advertisedResources.cost(); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java index 5bebd346bdb..29f53f0336d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java @@ -2,7 +2,6 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.config.provision.ClusterResources; -import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeResources; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; @@ -20,11 +19,6 @@ public class AllocationOptimizer { private static final int minimumNodes = 2; // Since this number includes redundancy it cannot be lower than 2 private static final int maximumNodes = 150; - // When a query is issued on a node the cost is the sum of a fixed cost component and a cost component - // proportional to document count. We must account for this when comparing configurations with more or fewer nodes. 
- // TODO: Measure this, and only take it into account with queries - private static final double fixedCpuCostFraction = 0.1; - private final NodeRepository nodeRepository; public AllocationOptimizer(NodeRepository nodeRepository) { @@ -53,17 +47,10 @@ public class AllocationOptimizer { for (int groups = limits.min().groups(); groups <= limits.max().groups(); groups++) { for (int nodes = limits.min().nodes(); nodes <= limits.max().nodes(); nodes++) { if (nodes % groups != 0) continue; - int groupSize = nodes / groups; - - // Adjust for redundancy: Node in group if groups = 1, an extra group if multiple groups - // TODO: Make the best choice based on size and redundancy setting instead - int nodesAdjustedForRedundancy = target.adjustForRedundancy() && nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes; - int groupsAdjustedForRedundancy = target.adjustForRedundancy() && nodes > 1 ? (groups == 1 ? 1 : groups - 1) : groups; ClusterResources next = new ClusterResources(nodes, groups, - nodeResourcesWith(nodesAdjustedForRedundancy, - groupsAdjustedForRedundancy, + nodeResourcesWith(nodes, groups, limits, target, current, clusterModel)); var allocatableResources = AllocatableClusterResources.from(next, current.clusterSpec(), limits, hosts, nodeRepository); @@ -85,34 +72,14 @@ public class AllocationOptimizer { ResourceTarget target, AllocatableClusterResources current, ClusterModel clusterModel) { - double cpu, memory, disk; - int groupSize = nodes / groups; - - if (current.clusterSpec().type() == ClusterSpec.Type.content) { // load scales with node share of content - // Cpu: Query cpu scales with cluster size, write cpu scales with group size - // Memory and disk: Scales with group size - - // The fixed cost portion of cpu does not scale with changes to the node count - double queryCpuPerGroup = fixedCpuCostFraction * target.resources().vcpu() + - (1 - fixedCpuCostFraction) * target.resources().vcpu() * current.groupSize() / groupSize; - - double queryCpu 
= queryCpuPerGroup * current.groups() / groups; - double writeCpu = target.resources().vcpu() * current.groupSize() / groupSize; - cpu = clusterModel.queryCpuFraction() * queryCpu + (1 - clusterModel.queryCpuFraction()) * writeCpu; - memory = target.resources().memoryGb() * current.groupSize() / groupSize; - disk = target.resources().diskGb() * current.groupSize() / groupSize; - } - else { - cpu = target.resources().vcpu() * current.nodes() / nodes; - memory = target.resources().memoryGb(); - disk = target.resources().diskGb(); - } + var scaled = clusterModel.loadWith(nodes, groups) + .scaled(Load.one().divide(clusterModel.redundancyAdjustment()).scaled(target.resources())); // Combine the scaled resource values computed here // with the currently configured non-scaled values, given in the limits, if any - NodeResources nonScaled = limits.isEmpty() || limits.min().nodeResources().isUnspecified() - ? current.advertisedResources().nodeResources() - : limits.min().nodeResources(); // min=max for non-scaled - return nonScaled.withVcpu(cpu).withMemoryGb(memory).withDiskGb(disk); + var nonScaled = limits.isEmpty() || limits.min().nodeResources().isUnspecified() + ? 
current.advertisedResources().nodeResources() + : limits.min().nodeResources(); // min=max for non-scaled + return nonScaled.withVcpu(scaled.vcpu()).withMemoryGb(scaled.memoryGb()).withDiskGb(scaled.diskGb()); } /** Returns a copy of the given limits where the minimum nodes are at least the given value when allowed */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 5b1ee6cc496..ae18e7ffb91 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -33,6 +33,11 @@ public class ClusterModel { static final double idealContainerDiskLoad = 0.95; static final double idealContentDiskLoad = 0.6; + // When a query is issued on a node the cost is the sum of a fixed cost component and a cost component + // proportional to document count. We must account for this when comparing configurations with more or fewer nodes. + // TODO: Measure this, and only take it into account with queries + private static final double fixedCpuCostFraction = 0.1; + private final Application application; private final ClusterSpec clusterSpec; private final Cluster cluster; @@ -74,7 +79,7 @@ public class ClusterModel { this.application = application; this.clusterSpec = clusterSpec; this.cluster = cluster; - this.nodes = null; + this.nodes = NodeList.of(); this.clock = clock; this.scalingDuration = scalingDuration; @@ -86,6 +91,20 @@ public class ClusterModel { public ClusterSpec clusterSpec() { return clusterSpec; } public Cluster cluster() { return cluster; } + /** Returns the relative load adjustment that should be made to this cluster given available measurements. 
*/ + public Load loadAdjustment() { + if (nodeTimeseries().measurementsPerNode() == 0) return Load.one(); // No info, no change + /* + // Should we scale up? + Load relativePeak = nodeTimeseries().peakLoad().divide(idealLoad()); + if (relativePeak.any(v -> v > 1)) + return relativePeak.max(Load.one()); // Don't downscale any dimension if we upscale + + // Should we scale down? + */ + return averageLoad().divide(idealLoad()); + } + /** Returns the predicted duration of a rescaling of this cluster */ public Duration scalingDuration() { return scalingDuration; } @@ -114,8 +133,72 @@ public class ClusterModel { /** Returns average load during the last {@link #scalingDuration()} */ public Load averageLoad() { return nodeTimeseries().averageLoad(clock.instant().minus(scalingDuration())); } + /** The number of nodes this cluster has, or will have if not deployed yet. */ + // TODO: Make this the deployed, not current count + public int nodeCount() { + if ( ! nodes.isEmpty()) return (int)nodes.stream().count(); + return cluster.minResources().nodes(); + } + + /** The number of groups this cluster has, or will have if not deployed yet. */ + // TODO: Make this the deployed, not current count + public int groupCount() { + if ( ! nodes.isEmpty()) return (int)nodes.stream().mapToInt(node -> node.allocation().get().membership().cluster().group().get().index()).distinct().count(); + return cluster.minResources().groups(); + } + + public int groupSize() { + // ceil: If the division does not produce a whole number we assume some node is missing + return (int)Math.ceil((double)nodeCount() / groupCount()); + } + + /** Returns the relative load adjustment accounting for redundancy in this. */ + public Load redundancyAdjustment() { + return loadWith(nodeCount(), groupCount()); + } + + /** + * Returns the relative load adjustment accounting for redundancy given these nodes+groups + * relative to node nodes+groups in this. 
+ */ + public Load loadWith(int trueNodes, int trueGroups) { + int nodes = nodesAdjustedForRedundancy(trueNodes, trueGroups); + int groups = groupsAdjustedForRedundancy(trueNodes, trueGroups); + if (clusterSpec().type() == ClusterSpec.Type.content) { // load scales with node share of content + int groupSize = nodes / groups; + + // Cpu: Query cpu scales with cluster size, write cpu scales with group size + // Memory and disk: Scales with group size + + // The fixed cost portion of cpu does not scale with changes to the node count + double queryCpuPerGroup = fixedCpuCostFraction + (1 - fixedCpuCostFraction) * groupSize() / groupSize; + + double queryCpu = queryCpuPerGroup * groupCount() / groups; + double writeCpu = (double)groupSize() / groupSize; + return new Load(queryCpuFraction() * queryCpu + (1 - queryCpuFraction()) * writeCpu, + (double)groupSize() / groupSize, + (double)groupSize() / groupSize); + } + else { + return new Load((double)nodeCount() / nodes, 1, 1); + } + } + + /** + * Returns the ideal load across the nodes of this such that each node will be at ideal load + * if one of the nodes goes down. + */ public Load idealLoad() { - return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad()); + return new Load(idealCpuLoad(), idealMemoryLoad, idealDiskLoad()).divide(redundancyAdjustment()); + } + + public int nodesAdjustedForRedundancy(int nodes, int groups) { + int groupSize = (int)Math.ceil((double)nodes / groups); + return nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes; + } + + public int groupsAdjustedForRedundancy(int nodes, int groups) { + return nodes > 1 ? (groups == 1 ? 
1 : groups - 1) : groups; } /** Ideal cpu load must take the application traffic fraction into account */ diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java index 36056665a15..ab5be045dd4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java @@ -8,6 +8,7 @@ import java.time.Duration; import java.time.Instant; import java.util.List; import java.util.Optional; +import java.util.OptionalDouble; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -49,6 +50,7 @@ public class ClusterNodesTimeseries { /** Returns the average number of measurements per node */ public int measurementsPerNode() { + if (clusterNodes.size() == 0) return 0; int measurementCount = timeseries.stream().mapToInt(m -> m.size()).sum(); return measurementCount / clusterNodes.size(); } @@ -84,6 +86,27 @@ public class ClusterNodesTimeseries { return total.divide(count); } + /** + * Returns the "peak load" in this: Which is for each load dimension, + * the average of the highest reading for that dimension on each node. 
+ */ + public Load peakLoad() { + return new Load(peakLoad(Load.Dimension.cpu), peakLoad(Load.Dimension.memory), peakLoad(Load.Dimension.disk)); + } + + private double peakLoad(Load.Dimension dimension) { + double total = 0; + int count = 0; + for (var nodeTimeseries : timeseries) { + OptionalDouble value = nodeTimeseries.peak(dimension); + if (value.isEmpty()) continue; + total += value.getAsDouble(); + count++; + } + if (count == 0) return 0; + return total / count; + } + private static List<NodeTimeseries> keep(List<NodeTimeseries> timeseries, Predicate<NodeMetricSnapshot> filter) { return timeseries.stream().map(nodeTimeseries -> nodeTimeseries.keep(filter)).collect(Collectors.toList()); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java index a52b048a9e0..88c7e70cd35 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Load.java @@ -3,6 +3,12 @@ package com.yahoo.vespa.hosted.provision.autoscale; import com.yahoo.config.provision.NodeResources; +import java.util.Objects; +import java.util.function.DoubleBinaryOperator; +import java.util.function.DoubleFunction; +import java.util.function.DoubleUnaryOperator; +import java.util.function.Predicate; + /** * The load of a node or system, measured as fractions of max (1.0) in three dimensions. 
* @@ -10,6 +16,8 @@ import com.yahoo.config.provision.NodeResources; */ public class Load { + public enum Dimension { cpu, memory, disk } + private final double cpu, memory, disk; public Load(double cpu, double memory, double disk) { @@ -23,27 +31,51 @@ public class Load { public double disk() { return disk; } public Load add(Load other) { - return new Load(cpu + other.cpu(), memory + other.memory(), disk + other.disk()); + return join(other, (a, b) -> a + b); } public Load multiply(NodeResources resources) { return new Load(cpu * resources.vcpu(), memory * resources.memoryGb(), disk * resources.diskGb()); } - public Load multiply(double factor) { - return new Load(cpu * factor, memory * factor, disk * factor); + return map(v -> v * factor); + } + public Load multiply(Load other) { + return join(other, (a, b) -> a * b); } + public Load divide(Load divisor) { + return join(divisor, (a, b) -> divide(a, b)); + } + public Load divide(double divisor) { + return map(v -> divide(v, divisor)); + } public Load divide(NodeResources resources) { return new Load(divide(cpu, resources.vcpu()), divide(memory, resources.memoryGb()), divide(disk, resources.diskGb())); } - public Load divide(Load divisor) { - return new Load(divide(cpu, divisor.cpu()), divide(memory, divisor.memory()), divide(disk, divisor.disk())); + /** Returns the load having the max value of this and the given load in each dimension. */ + public Load max(Load other) { + return join(other, (a, b) -> Math.max(a, b)); } - public Load divide(double divisor) { - return new Load(divide(cpu, divisor), divide(memory, divisor), divide(disk, divisor)); + /** Returns the load where the given function is applied to each dimension of this. */ + public Load map(DoubleUnaryOperator f) { + return new Load(f.applyAsDouble(cpu), + f.applyAsDouble(memory), + f.applyAsDouble(disk)); + } + + /** Returns the load where the given function is applied to each dimension of this and the given load. 
*/ + public Load join(Load other, DoubleBinaryOperator f) { + return new Load(f.applyAsDouble(this.cpu(), other.cpu()), + f.applyAsDouble(this.memory(), other.memory()), + f.applyAsDouble(this.disk(), other.disk())); + } + + /** Returns true if any dimension matches the predicate. */ + public boolean any(Predicate<Double> test) { + return test.test(cpu) || test.test(memory) || test.test(disk); } public NodeResources scaled(NodeResources resources) { @@ -52,6 +84,14 @@ public class Load { .withDiskGb(disk * resources.diskGb()); } + public double get(Dimension dimension) { + return switch (dimension) { + case cpu -> cpu(); + case memory -> memory(); + case disk -> disk(); + }; + } + private double requireNormalized(double value, String name) { if (Double.isNaN(value)) throw new IllegalArgumentException(name + " must be a number but is NaN"); @@ -60,17 +100,31 @@ public class Load { return value; } + private static double divide(double a, double b) { + if (a == 0 && b == 0) return 0; + return a / b; + } + + @Override + public boolean equals(Object o) { + if (o == this) return true; + if ( ! 
(o instanceof Load other)) return false; + if (other.cpu() != this.cpu()) return false; + if (other.memory() != this.memory()) return false; + if (other.disk() != this.disk()) return false; + return true; + } + + @Override + public int hashCode() { return Objects.hash(cpu, memory, disk); } + @Override public String toString() { return "load: " + cpu + " cpu, " + memory + " memory, " + disk + " disk"; } public static Load zero() { return new Load(0, 0, 0); } - - private static double divide(double a, double b) { - if (a == 0 && b == 0) return 0; - return a / b; - } + public static Load one() { return new Load(1, 1, 1); } public static Load byDividing(NodeResources a, NodeResources b) { return new Load(divide(a.vcpu(), b.vcpu()), divide(a.memoryGb(), b.memoryGb()), divide(a.diskGb(), b.diskGb())); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java index 4a5f8972e11..500dbf0f66f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/NodeTimeseries.java @@ -6,6 +6,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Optional; +import java.util.OptionalDouble; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -40,6 +41,10 @@ public class NodeTimeseries { return Optional.of(snapshots.get(snapshots.size() - 1)); } + public OptionalDouble peak(Load.Dimension dimension) { + return snapshots.stream().mapToDouble(snapshot -> snapshot.load().get(dimension)).max(); + } + public List<NodeMetricSnapshot> asList() { return snapshots; } public String hostname() { return hostname; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java 
b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java index 72836baaf5b..7bc019caabb 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java @@ -10,43 +10,37 @@ import java.util.OptionalDouble; /** * A resource target to hit for the allocation optimizer. - * The target is measured in cpu, memory and disk per node in the allocation given by current. + * The target is measured in cpu, memory and disk per node in the current allocation. * * @author bratseth */ public class ResourceTarget { - private final boolean adjustForRedundancy; - /** The target real resources per node, assuming the node assignment where this was decided */ private final NodeResources resources; - private ResourceTarget(NodeResources resources, boolean adjustForRedundancy) { + private ResourceTarget(NodeResources resources) { this.resources = resources; - this.adjustForRedundancy = adjustForRedundancy; } - /** Are the target resources given by this including redundancy or not */ - public boolean adjustForRedundancy() { return adjustForRedundancy; } - /** Returns the target resources per node in terms of the current allocation */ public NodeResources resources() { return resources; } @Override public String toString() { - return "target " + resources + (adjustForRedundancy ? 
"(with redundancy adjustment) " : ""); + return "target " + resources; } /** Create a target of achieving ideal load given a current load */ public static ResourceTarget idealLoad(ClusterModel clusterModel, AllocatableClusterResources current) { - var loadAdjustment = clusterModel.averageLoad().divide(clusterModel.idealLoad()); - return new ResourceTarget(loadAdjustment.scaled(current.realResources().nodeResources()), true); + return new ResourceTarget(clusterModel.loadAdjustment().scaled(current.realResources().nodeResources())); } /** Crete a target of preserving a current allocation */ - public static ResourceTarget preserve(AllocatableClusterResources current) { - return new ResourceTarget(current.realResources().nodeResources(), false); + public static ResourceTarget preserve(ClusterModel clusterModel, + AllocatableClusterResources current) { + return new ResourceTarget(current.realResources().nodeResources()); } } |