diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-08-15 12:33:43 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-08-15 12:33:43 +0200 |
commit | ee2cde5a803ee3f553b7495eb642b455d19ca64f (patch) | |
tree | 6cff4442dd6d21eff528deab9156973dead20094 | |
parent | 0ad86ce2fdd0c357c8fc271bfd3f2d8f860b2125 (diff) |
Consider switching cost when choosing resources
8 files changed, 144 insertions, 12 deletions
diff --git a/document/src/test/java/com/yahoo/document/DocumentTestCase.java b/document/src/test/java/com/yahoo/document/DocumentTestCase.java index 33b77cb1878..e5f6453c581 100644 --- a/document/src/test/java/com/yahoo/document/DocumentTestCase.java +++ b/document/src/test/java/com/yahoo/document/DocumentTestCase.java @@ -42,7 +42,7 @@ import static org.junit.Assert.fail; /** * Test for Document and all its features, including (de)serialization. * - * @author <a href="thomasg@yahoo-inc.com>Thomas Gundersen</a> + * @author Thomas Gundersen * @author bratseth */ public class DocumentTestCase extends DocumentTestCaseBase { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java index 1ca81df824b..796bc2eeb92 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java @@ -208,6 +208,16 @@ public class Cluster { return minimum(ClusterModel.minScalingDuration(clusterSpec), totalDuration.dividedBy(completedEventCount)); } + /** The predicted time this cluster will stay in each resource configuration (including the scaling duration). 
*/ + public Duration allocationDuration(ClusterSpec clusterSpec) { + if (scalingEvents.size() < 2) return Duration.ofHours(12); // Default + + long totalDurationMs = 0; + for (int i = 1; i < scalingEvents().size(); i++) + totalDurationMs += scalingEvents().get(i).at().toEpochMilli() - scalingEvents().get(i - 1).at().toEpochMilli(); + return Duration.ofMillis(totalDurationMs / (scalingEvents.size() - 1)); + } + private static Duration minimum(Duration smallestAllowed, Duration duration) { if (duration.minus(smallestAllowed).isNegative()) return smallestAllowed; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java index c19d76efb35..0f100593e38 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java @@ -112,6 +112,7 @@ public class AllocatableClusterResources { public ClusterSpec clusterSpec() { return clusterSpec; } + /** Returns the standard cost of these resources, in dollars per hour */ public double cost() { return nodes * advertisedResources.cost(); } /** @@ -135,6 +136,11 @@ public class AllocatableClusterResources { return this.cost() < other.cost(); // otherwise, prefer lower cost } + /** The estimated cost of changing from the given current resources to this. 
*/ + public double costChangingFrom(AllocatableClusterResources current, ClusterModel clusterModel) { + return new ResourceChange(current, this, clusterModel).cost(); + } + @Override public String toString() { return advertisedResources() + diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java index 42bb16005ee..2511f17ee1a 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java @@ -7,6 +7,7 @@ import com.yahoo.config.provision.NodeResources; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.provisioning.NodeResourceLimits; +import java.time.Duration; import java.util.Optional; import static com.yahoo.vespa.hosted.provision.autoscale.Autoscaler.headroomRequiredToScaleDown; @@ -66,13 +67,27 @@ public class AllocationOptimizer { availableRealHostResources, nodeRepository); if (allocatableResources.isEmpty()) continue; - if (bestAllocation.isEmpty() || allocatableResources.get().preferableTo(bestAllocation.get())) + if (bestAllocation.isEmpty() || preferableTo(bestAllocation.get(), allocatableResources.get(), current, clusterModel)) bestAllocation = allocatableResources; } } return bestAllocation; } + private boolean preferableTo(AllocatableClusterResources best, AllocatableClusterResources considered, + AllocatableClusterResources current, ClusterModel clusterModel) { + if (best.fulfilment() < 1 || considered.fulfilment() < 1) // always fulfil as much as possible + return considered.fulfilment() > best.fulfilment(); + + return considered.cost() * toHours(clusterModel.allocationDuration()) + considered.costChangingFrom(current, clusterModel) + < + best.cost() * toHours(clusterModel.allocationDuration()) + 
best.costChangingFrom(current, clusterModel); + } + + private double toHours(Duration duration) { + return duration.toMillis() / 3600000.0; + } + /** Returns the max resources of a host one node may allocate. */ private NodeResources maxResourcesOf(NodeResources hostResources, ClusterModel clusterModel) { if (nodeRepository.exclusiveAllocation(clusterModel.clusterSpec())) return hostResources; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 0d64d4fbb10..0bb8a4c3222 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -63,6 +63,7 @@ public class ClusterModel { private final Clock clock; private final Duration scalingDuration; + private final Duration allocationDuration; private final ClusterTimeseries clusterTimeseries; private final ClusterNodesTimeseries nodeTimeseries; private final Instant at; @@ -86,6 +87,7 @@ public class ClusterModel { this.nodes = clusterNodes; this.clock = clock; this.scalingDuration = cluster.scalingDuration(clusterSpec); + this.allocationDuration = cluster.allocationDuration(clusterSpec); this.clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id()); this.nodeTimeseries = new ClusterNodesTimeseries(scalingDuration(), cluster, nodes, metricsDb); this.at = clock.instant(); @@ -97,6 +99,7 @@ public class ClusterModel { Cluster cluster, Clock clock, Duration scalingDuration, + Duration allocationDuration, ClusterTimeseries clusterTimeseries, ClusterNodesTimeseries nodeTimeseries) { this.nodeRepository = nodeRepository; @@ -107,6 +110,7 @@ public class ClusterModel { this.clock = clock; this.scalingDuration = scalingDuration; + this.allocationDuration = allocationDuration; this.clusterTimeseries = clusterTimeseries; 
this.nodeTimeseries = nodeTimeseries; this.at = clock.instant(); @@ -127,6 +131,23 @@ public class ClusterModel { /** Returns the predicted duration of a rescaling of this cluster */ public Duration scalingDuration() { return scalingDuration; } + /** + * Returns the predicted duration of a resource change in this cluster, + * until we, or the application, will change it again. + */ + public Duration allocationDuration() { return allocationDuration; } + + /** Returns the predicted duration of data redistribution in this cluster: zero unless this is a content cluster. */ + public Duration redistributionDuration() { + if ( ! clusterSpec.type().isContent()) return Duration.ofMinutes(0); + return scalingDuration(); // TODO: Estimate separately + } + + /** Returns the predicted duration of replacing all the nodes in this cluster. */ + public Duration nodeReplacementDuration() { + return Duration.ofMinutes(5); // TODO: Estimate? + } + /** Returns the average of the peak load measurement in each dimension, from each node. */ public Load peakLoad() { return nodeTimeseries().peakLoad(); @@ -137,6 +158,10 @@ public class ClusterModel { return loadWith(nodeCount(), groupCount()); } + public boolean isExclusive() { + return nodeRepository.exclusiveAllocation(clusterSpec); + } + /** Returns the relative load adjustment that should be made to this cluster given available measurements. */ public Load loadAdjustment() { if (nodeTimeseries().measurementsPerNode() < 0.5) return Load.one(); // Don't change based on very little data diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java new file mode 100644 index 00000000000..3073c22aea7 --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java @@ -0,0 +1,80 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.vespa.hosted.provision.autoscale; + +import com.yahoo.config.provision.ClusterSpec; +import com.yahoo.config.provision.NodeResources; + +import java.time.Duration; + +/** + * A resource change. + * + * @author bratseth + */ +public class ResourceChange { + + private final AllocatableClusterResources from, to; + private final ClusterModel clusterModel; + + public ResourceChange(AllocatableClusterResources from, AllocatableClusterResources to, ClusterModel clusterModel) { + this.from = from; + this.to = to; + this.clusterModel = clusterModel; + } + + /** Returns the estimated total cost of this resource change (coming in addition to the "to" resource cost). */ + public double cost() { + if (requiresRedistribution()) return toHours(clusterModel.redistributionDuration()) * from.cost(); + if (requiresNodeReplacement()) return toHours(clusterModel.nodeReplacementDuration()) * from.cost(); + return 0; + } + + private boolean requiresRedistribution() { + if ( ! clusterModel.clusterSpec().type().isContent()) return false; + if (from.nodes() != to.nodes()) return true; + if (from.groups() != to.groups()) return true; + if (requiresNodeReplacement()) return true; + return false; + } + + /** Returns true if the *existing* nodes of this needs to be replaced in this change. */ + private boolean requiresNodeReplacement() { + var fromNodes = from.advertisedResources().nodeResources(); + var toNodes = to.advertisedResources().nodeResources(); + + if (clusterModel.isExclusive()) { + return ! fromNodes.equals(toNodes); + } + else { + if ( ! fromNodes.justNonNumbers().equalsWhereSpecified(toNodes.justNonNumbers())) return true; + if ( ! 
canInPlaceResize()) return true; + return false; + } + } + + private double toHours(Duration duration) { + return duration.toMillis() / 3600000.0; + } + + private boolean canInPlaceResize() { + return canInPlaceResize(from.nodes(), from.advertisedResources().nodeResources(), + to.nodes(), to.advertisedResources().nodeResources(), + clusterModel.clusterSpec().type(), clusterModel.isExclusive(), from.groups() != to.groups()); + } + + public static boolean canInPlaceResize(int fromCount, NodeResources fromResources, + int toCount, NodeResources toResources, + ClusterSpec.Type type, boolean exclusive, boolean hasTopologyChange) { + if (exclusive) return false; // exclusive resources must match the host + + // Never allow in-place resize when also changing topology or decreasing cluster size + if (hasTopologyChange || toCount < fromCount) return false; + + // Do not allow increasing cluster size and decreasing node resources at the same time for content nodes + if (type.isContent() && toCount > fromCount && !toResources.satisfies(fromResources.justNumbers())) + return false; + + return true; + } + +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java index cea0608013d..77f37cadc0b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java @@ -6,6 +6,7 @@ import com.yahoo.config.provision.ClusterSpec; import com.yahoo.config.provision.NodeResources; import com.yahoo.config.provision.NodeType; import com.yahoo.vespa.hosted.provision.Node; +import com.yahoo.vespa.hosted.provision.autoscale.ResourceChange; import java.time.Duration; import java.util.Map; @@ -162,16 +163,11 @@ public interface NodeSpec { @Override public boolean canResize(NodeResources currentNodeResources, NodeResources 
currentSpareHostResources, ClusterSpec.Type type, boolean hasTopologyChange, int currentClusterSize) { - if (exclusive) return false; // exclusive resources must match the host - // Never allow in-place resize when also changing topology or decreasing cluster size - if (hasTopologyChange || count < currentClusterSize) return false; + return ResourceChange.canInPlaceResize(currentClusterSize, currentNodeResources, count, requestedNodeResources, + type, exclusive, hasTopologyChange) + && + currentSpareHostResources.add(currentNodeResources.justNumbers()).satisfies(requestedNodeResources); - // Do not allow increasing cluster size and decreasing node resources at the same time for content nodes - if (type.isContent() && count > currentClusterSize && !requestedNodeResources.satisfies(currentNodeResources.justNumbers())) - return false; - - // Otherwise, allowed as long as the host can satisfy the new requested resources - return currentSpareHostResources.add(currentNodeResources.justNumbers()).satisfies(requestedNodeResources); } @Override diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java index ec084014a6a..6477f2e34cb 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java @@ -95,7 +95,7 @@ public class ClusterModelTest { application = application.with(cluster); return new ClusterModel(new ProvisioningTester.Builder().build().nodeRepository(), application.with(status), - clusterSpec, cluster, clock, Duration.ofMinutes(10), + clusterSpec, cluster, clock, Duration.ofMinutes(10), Duration.ofMinutes(5), timeseries(cluster,100, queryRate, writeRate, clock), ClusterNodesTimeseries.empty()); } |