aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorValerij Fredriksen <freva@users.noreply.github.com>2023-08-15 22:10:36 +0200
committerGitHub <noreply@github.com>2023-08-15 22:10:36 +0200
commit2fc7e4ee6646f29e398eef7f46af3454f32c992d (patch)
tree82319ae2c3ec11fb94f0ff0b9f1776aadd035f72
parent2ae55dbfddc73cfff8619b1735f52895afe1be9e (diff)
parent66ff658df11a001c0397db6756da0d3be2da905d (diff)
Merge pull request #28056 from vespa-engine/bratseth/consider-reallocation-costv8.212.18
Bratseth/consider reallocation cost
-rw-r--r--document/src/test/java/com/yahoo/document/DocumentTestCase.java2
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java10
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableResources.java (renamed from node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java)116
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java46
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java42
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaling.java20
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java31
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java80
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java4
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java26
-rw-r--r--node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java14
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java106
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java28
-rw-r--r--node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java33
14 files changed, 375 insertions, 183 deletions
diff --git a/document/src/test/java/com/yahoo/document/DocumentTestCase.java b/document/src/test/java/com/yahoo/document/DocumentTestCase.java
index 33b77cb1878..e5f6453c581 100644
--- a/document/src/test/java/com/yahoo/document/DocumentTestCase.java
+++ b/document/src/test/java/com/yahoo/document/DocumentTestCase.java
@@ -42,7 +42,7 @@ import static org.junit.Assert.fail;
/**
* Test for Document and all its features, including (de)serialization.
*
- * @author <a href="thomasg@yahoo-inc.com>Thomas Gundersen</a>
+ * @author Thomas Gundersen
* @author bratseth
*/
public class DocumentTestCase extends DocumentTestCaseBase {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
index 1ca81df824b..796bc2eeb92 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
@@ -208,6 +208,16 @@ public class Cluster {
return minimum(ClusterModel.minScalingDuration(clusterSpec), totalDuration.dividedBy(completedEventCount));
}
+ /** The predicted time this cluster will stay in each resource configuration (including the scaling duration). */
+ public Duration allocationDuration(ClusterSpec clusterSpec) {
+ if (scalingEvents.size() < 2) return Duration.ofHours(12); // Default
+
+ long totalDurationMs = 0;
+ for (int i = 1; i < scalingEvents().size(); i++)
+ totalDurationMs += scalingEvents().get(i).at().toEpochMilli() - scalingEvents().get(i - 1).at().toEpochMilli();
+ return Duration.ofMillis(totalDurationMs / (scalingEvents.size() - 1));
+ }
+
private static Duration minimum(Duration smallestAllowed, Duration duration) {
if (duration.minus(smallestAllowed).isNegative())
return smallestAllowed;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableResources.java
index c19d76efb35..8069c9c089b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableClusterResources.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocatableResources.java
@@ -10,13 +10,14 @@ import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
+import java.time.Duration;
import java.util.List;
import java.util.Optional;
/**
* @author bratseth
*/
-public class AllocatableClusterResources {
+public class AllocatableResources {
/** The node count in the cluster */
private final int nodes;
@@ -32,9 +33,9 @@ public class AllocatableClusterResources {
private final double fulfilment;
/** Fake allocatable resources from requested capacity */
- public AllocatableClusterResources(ClusterResources requested,
- ClusterSpec clusterSpec,
- NodeRepository nodeRepository) {
+ public AllocatableResources(ClusterResources requested,
+ ClusterSpec clusterSpec,
+ NodeRepository nodeRepository) {
this.nodes = requested.nodes();
this.groups = requested.groups();
this.realResources = nodeRepository.resourcesCalculator().requestToReal(requested.nodeResources(), nodeRepository.exclusiveAllocation(clusterSpec), false);
@@ -43,7 +44,7 @@ public class AllocatableClusterResources {
this.fulfilment = 1;
}
- public AllocatableClusterResources(NodeList nodes, NodeRepository nodeRepository) {
+ public AllocatableResources(NodeList nodes, NodeRepository nodeRepository) {
this.nodes = nodes.size();
this.groups = (int)nodes.stream().map(node -> node.allocation().get().membership().cluster().group()).distinct().count();
this.realResources = averageRealResourcesOf(nodes.asList(), nodeRepository); // Average since we average metrics over nodes
@@ -52,10 +53,10 @@ public class AllocatableClusterResources {
this.fulfilment = 1;
}
- public AllocatableClusterResources(ClusterResources realResources,
- NodeResources advertisedResources,
- ClusterResources idealResources,
- ClusterSpec clusterSpec) {
+ public AllocatableResources(ClusterResources realResources,
+ NodeResources advertisedResources,
+ ClusterResources idealResources,
+ ClusterSpec clusterSpec) {
this.nodes = realResources.nodes();
this.groups = realResources.groups();
this.realResources = realResources.nodeResources();
@@ -64,12 +65,12 @@ public class AllocatableClusterResources {
this.fulfilment = fulfilment(realResources, idealResources);
}
- private AllocatableClusterResources(int nodes,
- int groups,
- NodeResources realResources,
- NodeResources advertisedResources,
- ClusterSpec clusterSpec,
- double fulfilment) {
+ private AllocatableResources(int nodes,
+ int groups,
+ NodeResources realResources,
+ NodeResources advertisedResources,
+ ClusterSpec clusterSpec,
+ double fulfilment) {
this.nodes = nodes;
this.groups = groups;
this.realResources = realResources;
@@ -79,16 +80,16 @@ public class AllocatableClusterResources {
}
/** Returns this with the redundant node or group removed from counts. */
- public AllocatableClusterResources withoutRedundancy() {
+ public AllocatableResources withoutRedundancy() {
int groupSize = nodes / groups;
int nodesAdjustedForRedundancy = nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes;
int groupsAdjustedForRedundancy = nodes > 1 ? (groups == 1 ? 1 : groups - 1) : groups;
- return new AllocatableClusterResources(nodesAdjustedForRedundancy,
- groupsAdjustedForRedundancy,
- realResources,
- advertisedResources,
- clusterSpec,
- fulfilment);
+ return new AllocatableResources(nodesAdjustedForRedundancy,
+ groupsAdjustedForRedundancy,
+ realResources,
+ advertisedResources,
+ clusterSpec,
+ fulfilment);
}
/**
@@ -112,6 +113,7 @@ public class AllocatableClusterResources {
public ClusterSpec clusterSpec() { return clusterSpec; }
+ /** Returns the standard cost of these resources, in dollars per hour */
public double cost() { return nodes * advertisedResources.cost(); }
/**
@@ -128,11 +130,22 @@ public class AllocatableClusterResources {
return (vcpuFulfilment + memoryGbFulfilment + diskGbFulfilment) / 3;
}
- public boolean preferableTo(AllocatableClusterResources other) {
- if (this.fulfilment < 1 || other.fulfilment < 1) // always fulfil as much as possible
- return this.fulfilment > other.fulfilment;
+ public boolean preferableTo(AllocatableResources other, ClusterModel model) {
+ if (other.fulfilment() < 1 || this.fulfilment() < 1) // always fulfil as much as possible
+ return this.fulfilment() > other.fulfilment();
- return this.cost() < other.cost(); // otherwise, prefer lower cost
+ return this.cost() * toHours(model.allocationDuration()) + this.costChangingFrom(model)
+ <
+ other.cost() * toHours(model.allocationDuration()) + other.costChangingFrom(model);
+ }
+
+ private double toHours(Duration duration) {
+ return duration.toMillis() / 3600000.0;
+ }
+
+ /** The estimated cost of changing from the given current resources to this. */
+ public double costChangingFrom(ClusterModel model) {
+ return new ResourceChange(model, this).cost();
}
@Override
@@ -154,12 +167,13 @@ public class AllocatableClusterResources {
.withBandwidthGbps(sum.bandwidthGbps() / nodes.size());
}
- public static Optional<AllocatableClusterResources> from(ClusterResources wantedResources,
- ApplicationId applicationId,
- ClusterSpec clusterSpec,
- Limits applicationLimits,
- List<NodeResources> availableRealHostResources,
- NodeRepository nodeRepository) {
+ public static Optional<AllocatableResources> from(ClusterResources wantedResources,
+ ApplicationId applicationId,
+ ClusterSpec clusterSpec,
+ Limits applicationLimits,
+ List<NodeResources> availableRealHostResources,
+ ClusterModel model,
+ NodeRepository nodeRepository) {
var systemLimits = nodeRepository.nodeResourceLimits();
boolean exclusive = nodeRepository.exclusiveAllocation(clusterSpec);
if (! exclusive) {
@@ -193,8 +207,8 @@ public class AllocatableClusterResources {
}
else { // Return the cheapest flavor satisfying the requested resources, if any
NodeResources cappedWantedResources = applicationLimits.cap(wantedResources.nodeResources());
- Optional<AllocatableClusterResources> best = Optional.empty();
- Optional<AllocatableClusterResources> bestDisregardingDiskLimit = Optional.empty();
+ Optional<AllocatableResources> best = Optional.empty();
+ Optional<AllocatableResources> bestDisregardingDiskLimit = Optional.empty();
for (Flavor flavor : nodeRepository.flavors().getFlavors()) {
// Flavor decide resources: Real resources are the worst case real resources we'll get if we ask for these advertised resources
NodeResources advertisedResources = nodeRepository.resourcesCalculator().advertisedResourcesOf(flavor);
@@ -216,18 +230,18 @@ public class AllocatableClusterResources {
if ( ! between(applicationLimits.min().nodeResources(), applicationLimits.max().nodeResources(), advertisedResources)) continue;
if ( ! systemLimits.isWithinRealLimits(realResources, applicationId, clusterSpec)) continue;
- var candidate = new AllocatableClusterResources(wantedResources.with(realResources),
- advertisedResources,
- wantedResources,
- clusterSpec);
+ var candidate = new AllocatableResources(wantedResources.with(realResources),
+ advertisedResources,
+ wantedResources,
+ clusterSpec);
if ( ! systemLimits.isWithinAdvertisedDiskLimits(advertisedResources, clusterSpec)) { // TODO: Remove when disk limit is enforced
- if (bestDisregardingDiskLimit.isEmpty() || candidate.preferableTo(bestDisregardingDiskLimit.get())) {
+ if (bestDisregardingDiskLimit.isEmpty() || candidate.preferableTo(bestDisregardingDiskLimit.get(), model)) {
bestDisregardingDiskLimit = Optional.of(candidate);
}
continue;
}
- if (best.isEmpty() || candidate.preferableTo(best.get())) {
+ if (best.isEmpty() || candidate.preferableTo(best.get(), model)) {
best = Optional.of(candidate);
}
}
@@ -237,13 +251,13 @@ public class AllocatableClusterResources {
}
}
- private static AllocatableClusterResources calculateAllocatableResources(ClusterResources wantedResources,
- NodeRepository nodeRepository,
- ApplicationId applicationId,
- ClusterSpec clusterSpec,
- Limits applicationLimits,
- boolean exclusive,
- boolean bestCase) {
+ private static AllocatableResources calculateAllocatableResources(ClusterResources wantedResources,
+ NodeRepository nodeRepository,
+ ApplicationId applicationId,
+ ClusterSpec clusterSpec,
+ Limits applicationLimits,
+ boolean exclusive,
+ boolean bestCase) {
var systemLimits = nodeRepository.nodeResourceLimits();
var advertisedResources = nodeRepository.resourcesCalculator().realToRequest(wantedResources.nodeResources(), exclusive, bestCase);
advertisedResources = systemLimits.enlargeToLegal(advertisedResources, applicationId, clusterSpec, exclusive, true); // Ask for something legal
@@ -255,10 +269,10 @@ public class AllocatableClusterResources {
advertisedResources = advertisedResources.with(NodeResources.StorageType.remote);
realResources = nodeRepository.resourcesCalculator().requestToReal(advertisedResources, exclusive, bestCase);
}
- return new AllocatableClusterResources(wantedResources.with(realResources),
- advertisedResources,
- wantedResources,
- clusterSpec);
+ return new AllocatableResources(wantedResources.with(realResources),
+ advertisedResources,
+ wantedResources,
+ clusterSpec);
}
/** Returns true if the given resources could be allocated on any of the given host flavors */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java
index 42bb16005ee..f650d8ec269 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/AllocationOptimizer.java
@@ -5,7 +5,6 @@ import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.IntRange;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.vespa.hosted.provision.NodeRepository;
-import com.yahoo.vespa.hosted.provision.provisioning.NodeResourceLimits;
import java.util.Optional;
@@ -35,21 +34,20 @@ public class AllocationOptimizer {
* @return the best allocation, if there are any possible legal allocations, fulfilling the target
* fully or partially, within the limits
*/
- public Optional<AllocatableClusterResources> findBestAllocation(Load loadAdjustment,
- AllocatableClusterResources current,
- ClusterModel clusterModel,
- Limits limits) {
+ public Optional<AllocatableResources> findBestAllocation(Load loadAdjustment,
+ ClusterModel model,
+ Limits limits) {
if (limits.isEmpty())
limits = Limits.of(new ClusterResources(minimumNodes, 1, NodeResources.unspecified()),
new ClusterResources(maximumNodes, maximumNodes, NodeResources.unspecified()),
IntRange.empty());
else
- limits = atLeast(minimumNodes, limits).fullySpecified(current.clusterSpec(), nodeRepository, clusterModel.application().id());
- Optional<AllocatableClusterResources> bestAllocation = Optional.empty();
+ limits = atLeast(minimumNodes, limits).fullySpecified(model.current().clusterSpec(), nodeRepository, model.application().id());
+ Optional<AllocatableResources> bestAllocation = Optional.empty();
var availableRealHostResources = nodeRepository.zone().cloud().dynamicProvisioning()
? nodeRepository.flavors().getFlavors().stream().map(flavor -> flavor.resources()).toList()
: nodeRepository.nodes().list().hosts().stream().map(host -> host.flavor().resources())
- .map(hostResources -> maxResourcesOf(hostResources, clusterModel))
+ .map(hostResources -> maxResourcesOf(hostResources, model))
.toList();
for (int groups = limits.min().groups(); groups <= limits.max().groups(); groups++) {
for (int nodes = limits.min().nodes(); nodes <= limits.max().nodes(); nodes++) {
@@ -58,15 +56,16 @@ public class AllocationOptimizer {
var resources = new ClusterResources(nodes,
groups,
nodeResourcesWith(nodes, groups,
- limits, loadAdjustment, current, clusterModel));
- var allocatableResources = AllocatableClusterResources.from(resources,
- clusterModel.application().id(),
- current.clusterSpec(),
- limits,
- availableRealHostResources,
- nodeRepository);
+ limits, loadAdjustment, model));
+ var allocatableResources = AllocatableResources.from(resources,
+ model.application().id(),
+ model.current().clusterSpec(),
+ limits,
+ availableRealHostResources,
+ model,
+ nodeRepository);
if (allocatableResources.isEmpty()) continue;
- if (bestAllocation.isEmpty() || allocatableResources.get().preferableTo(bestAllocation.get()))
+ if (bestAllocation.isEmpty() || allocatableResources.get().preferableTo(bestAllocation.get(), model))
bestAllocation = allocatableResources;
}
}
@@ -74,8 +73,8 @@ public class AllocationOptimizer {
}
/** Returns the max resources of a host one node may allocate. */
- private NodeResources maxResourcesOf(NodeResources hostResources, ClusterModel clusterModel) {
- if (nodeRepository.exclusiveAllocation(clusterModel.clusterSpec())) return hostResources;
+ private NodeResources maxResourcesOf(NodeResources hostResources, ClusterModel model) {
+ if (nodeRepository.exclusiveAllocation(model.clusterSpec())) return hostResources;
// static, shared hosts: Allocate at most half of the host cpu to simplify management
return hostResources.withVcpu(hostResources.vcpu() / 2);
}
@@ -88,9 +87,8 @@ public class AllocationOptimizer {
int groups,
Limits limits,
Load loadAdjustment,
- AllocatableClusterResources current,
- ClusterModel clusterModel) {
- var loadWithTarget = clusterModel.loadAdjustmentWith(nodes, groups, loadAdjustment);
+ ClusterModel model) {
+ var loadWithTarget = model.loadAdjustmentWith(nodes, groups, loadAdjustment);
// Leave some headroom above the ideal allocation to avoid immediately needing to scale back up
if (loadAdjustment.cpu() < 1 && (1.0 - loadWithTarget.cpu()) < headroomRequiredToScaleDown)
@@ -100,11 +98,11 @@ public class AllocationOptimizer {
if (loadAdjustment.disk() < 1 && (1.0 - loadWithTarget.disk()) < headroomRequiredToScaleDown)
loadAdjustment = loadAdjustment.withDisk(Math.min(1.0, loadAdjustment.disk() * (1.0 + headroomRequiredToScaleDown)));
- loadWithTarget = clusterModel.loadAdjustmentWith(nodes, groups, loadAdjustment);
+ loadWithTarget = model.loadAdjustmentWith(nodes, groups, loadAdjustment);
- var scaled = loadWithTarget.scaled(current.realResources().nodeResources());
+ var scaled = loadWithTarget.scaled(model.current().realResources().nodeResources());
var nonScaled = limits.isEmpty() || limits.min().nodeResources().isUnspecified()
- ? current.advertisedResources().nodeResources()
+ ? model.current().advertisedResources().nodeResources()
: limits.min().nodeResources(); // min=max for non-scaled
return nonScaled.withVcpu(scaled.vcpu()).withMemoryGb(scaled.memoryGb()).withDiskGb(scaled.diskGb());
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index 32b59319a88..b5f86be68f6 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -54,40 +54,40 @@ public class Autoscaler {
}
private Autoscaling autoscale(Application application, Cluster cluster, NodeList clusterNodes, Limits limits) {
- ClusterModel clusterModel = new ClusterModel(nodeRepository,
- application,
- clusterNodes.not().retired().clusterSpec(),
- cluster,
- clusterNodes,
- nodeRepository.metricsDb(),
- nodeRepository.clock());
- if (clusterModel.isEmpty()) return Autoscaling.empty();
+ var model = new ClusterModel(nodeRepository,
+ application,
+ clusterNodes.not().retired().clusterSpec(),
+ cluster,
+ clusterNodes,
+ new AllocatableResources(clusterNodes.not().retired(), nodeRepository),
+ nodeRepository.metricsDb(),
+ nodeRepository.clock());
+ if (model.isEmpty()) return Autoscaling.empty();
if (! limits.isEmpty() && cluster.minResources().equals(cluster.maxResources()))
- return Autoscaling.dontScale(Autoscaling.Status.unavailable, "Autoscaling is not enabled", clusterModel);
+ return Autoscaling.dontScale(Autoscaling.Status.unavailable, "Autoscaling is not enabled", model);
- if ( ! clusterModel.isStable(nodeRepository))
- return Autoscaling.dontScale(Status.waiting, "Cluster change in progress", clusterModel);
+ if ( ! model.isStable(nodeRepository))
+ return Autoscaling.dontScale(Status.waiting, "Cluster change in progress", model);
- var current = new AllocatableClusterResources(clusterNodes.not().retired(), nodeRepository);
- var loadAdjustment = clusterModel.loadAdjustment();
+ var loadAdjustment = model.loadAdjustment();
// Ensure we only scale down if we'll have enough headroom to not scale up again given a small load increase
- var target = allocationOptimizer.findBestAllocation(loadAdjustment, current, clusterModel, limits);
+ var target = allocationOptimizer.findBestAllocation(loadAdjustment, model, limits);
if (target.isEmpty())
- return Autoscaling.dontScale(Status.insufficient, "No allocations are possible within configured limits", clusterModel);
+ return Autoscaling.dontScale(Status.insufficient, "No allocations are possible within configured limits", model);
- if (! worthRescaling(current.realResources(), target.get().realResources())) {
+ if (! worthRescaling(model.current().realResources(), target.get().realResources())) {
if (target.get().fulfilment() < 0.9999999)
- return Autoscaling.dontScale(Status.insufficient, "Configured limits prevents ideal scaling of this cluster", clusterModel);
- else if ( ! clusterModel.safeToScaleDown() && clusterModel.idealLoad().any(v -> v < 1.0))
- return Autoscaling.dontScale(Status.ideal, "Cooling off before considering to scale down", clusterModel);
+ return Autoscaling.dontScale(Status.insufficient, "Configured limits prevents ideal scaling of this cluster", model);
+ else if ( ! model.safeToScaleDown() && model.idealLoad().any(v -> v < 1.0))
+ return Autoscaling.dontScale(Status.ideal, "Cooling off before considering to scale down", model);
else
- return Autoscaling.dontScale(Status.ideal, "Cluster is ideally scaled (within configured limits)", clusterModel);
+ return Autoscaling.dontScale(Status.ideal, "Cluster is ideally scaled (within configured limits)", model);
}
- return Autoscaling.scaleTo(target.get().advertisedResources(), clusterModel);
+ return Autoscaling.scaleTo(target.get().advertisedResources(), model);
}
/** Returns true if it is worthwhile to make the given resource change, false if it is too insignificant */
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaling.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaling.java
index 0c86108b36c..fad280d6c29 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaling.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaling.java
@@ -120,25 +120,25 @@ public class Autoscaling {
}
/** Creates an autoscaling conclusion which does not change the current allocation for a specified reason. */
- public static Autoscaling dontScale(Status status, String description, ClusterModel clusterModel) {
+ public static Autoscaling dontScale(Status status, String description, ClusterModel model) {
return new Autoscaling(status,
description,
Optional.empty(),
- clusterModel.at(),
- clusterModel.peakLoad(),
- clusterModel.idealLoad(),
- clusterModel.metrics());
+ model.at(),
+ model.peakLoad(),
+ model.idealLoad(),
+ model.metrics());
}
/** Creates an autoscaling conclusion to scale. */
- public static Autoscaling scaleTo(ClusterResources target, ClusterModel clusterModel) {
+ public static Autoscaling scaleTo(ClusterResources target, ClusterModel model) {
return new Autoscaling(Status.rescaling,
"Rescaling initiated due to load changes",
Optional.of(target),
- clusterModel.at(),
- clusterModel.peakLoad(),
- clusterModel.idealLoad(),
- clusterModel.metrics());
+ model.at(),
+ model.peakLoad(),
+ model.idealLoad(),
+ model.metrics());
}
public enum Status {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
index 0d64d4fbb10..7c2f3a563fb 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -50,6 +50,7 @@ public class ClusterModel {
private final Application application;
private final ClusterSpec clusterSpec;
private final Cluster cluster;
+ private final AllocatableResources current;
private final CpuModel cpu = new CpuModel();
private final MemoryModel memory = new MemoryModel();
@@ -63,6 +64,7 @@ public class ClusterModel {
private final Clock clock;
private final Duration scalingDuration;
+ private final Duration allocationDuration;
private final ClusterTimeseries clusterTimeseries;
private final ClusterNodesTimeseries nodeTimeseries;
private final Instant at;
@@ -77,6 +79,7 @@ public class ClusterModel {
ClusterSpec clusterSpec,
Cluster cluster,
NodeList clusterNodes,
+ AllocatableResources current,
MetricsDb metricsDb,
Clock clock) {
this.nodeRepository = nodeRepository;
@@ -84,8 +87,10 @@ public class ClusterModel {
this.clusterSpec = clusterSpec;
this.cluster = cluster;
this.nodes = clusterNodes;
+ this.current = current;
this.clock = clock;
this.scalingDuration = cluster.scalingDuration(clusterSpec);
+ this.allocationDuration = cluster.allocationDuration(clusterSpec);
this.clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id());
this.nodeTimeseries = new ClusterNodesTimeseries(scalingDuration(), cluster, nodes, metricsDb);
this.at = clock.instant();
@@ -95,8 +100,10 @@ public class ClusterModel {
Application application,
ClusterSpec clusterSpec,
Cluster cluster,
+ AllocatableResources current,
Clock clock,
Duration scalingDuration,
+ Duration allocationDuration,
ClusterTimeseries clusterTimeseries,
ClusterNodesTimeseries nodeTimeseries) {
this.nodeRepository = nodeRepository;
@@ -104,9 +111,11 @@ public class ClusterModel {
this.clusterSpec = clusterSpec;
this.cluster = cluster;
this.nodes = NodeList.of();
+ this.current = current;
this.clock = clock;
this.scalingDuration = scalingDuration;
+ this.allocationDuration = allocationDuration;
this.clusterTimeseries = clusterTimeseries;
this.nodeTimeseries = nodeTimeseries;
this.at = clock.instant();
@@ -114,6 +123,7 @@ public class ClusterModel {
public Application application() { return application; }
public ClusterSpec clusterSpec() { return clusterSpec; }
+ public AllocatableResources current() { return current; }
private ClusterNodesTimeseries nodeTimeseries() { return nodeTimeseries; }
private ClusterTimeseries clusterTimeseries() { return clusterTimeseries; }
@@ -127,6 +137,23 @@ public class ClusterModel {
/** Returns the predicted duration of a rescaling of this cluster */
public Duration scalingDuration() { return scalingDuration; }
+ /**
+ * Returns the predicted duration of a resource change in this cluster,
+ * until we, or the application , will change it again.
+ */
+ public Duration allocationDuration() { return allocationDuration; }
+
+ /** Returns the predicted duration of data redistribution in this cluster. */
+ public Duration redistributionDuration() {
+ if (! clusterSpec.type().isContent()) return Duration.ofMinutes(0);
+ return scalingDuration(); // TODO: Estimate separately
+ }
+
+ /** Returns the predicted duration of replacing all the nodes in this cluster. */
+ public Duration nodeReplacementDuration() {
+ return Duration.ofMinutes(5); // TODO: Estimate?
+ }
+
/** Returns the average of the peak load measurement in each dimension, from each node. */
public Load peakLoad() {
return nodeTimeseries().peakLoad();
@@ -137,6 +164,10 @@ public class ClusterModel {
return loadWith(nodeCount(), groupCount());
}
+ public boolean isExclusive() {
+ return nodeRepository.exclusiveAllocation(clusterSpec);
+ }
+
/** Returns the relative load adjustment that should be made to this cluster given available measurements. */
public Load loadAdjustment() {
if (nodeTimeseries().measurementsPerNode() < 0.5) return Load.one(); // Don't change based on very little data
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java
new file mode 100644
index 00000000000..c0cbb40d992
--- /dev/null
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceChange.java
@@ -0,0 +1,80 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.provision.autoscale;
+
+import com.yahoo.config.provision.ClusterSpec;
+import com.yahoo.config.provision.NodeResources;
+
+import java.time.Duration;
+
+/**
+ * A resource change.
+ *
+ * @author bratseth
+ */
+public class ResourceChange {
+
+ private final AllocatableResources from, to;
+ private final ClusterModel model;
+
+ public ResourceChange(ClusterModel model, AllocatableResources to) {
+ this.from = model.current();
+ this.to = to;
+ this.model = model;
+ }
+
+ /** Returns the estimated total cost of this resource change (coming in addition to the "to" resource cost). */
+ public double cost() {
+ if (requiresRedistribution()) return toHours(model.redistributionDuration()) * from.cost();
+ if (requiresNodeReplacement()) return toHours(model.nodeReplacementDuration()) * from.cost();
+ return 0;
+ }
+
+ private boolean requiresRedistribution() {
+ if ( ! model.clusterSpec().type().isContent()) return false;
+ if (from.nodes() != to.nodes()) return true;
+ if (from.groups() != to.groups()) return true;
+ if (requiresNodeReplacement()) return true;
+ return false;
+ }
+
+ /** Returns true if the *existing* nodes of this needs to be replaced in this change. */
+ private boolean requiresNodeReplacement() {
+ var fromNodes = from.advertisedResources().nodeResources();
+ var toNodes = to.advertisedResources().nodeResources();
+
+ if (model.isExclusive()) {
+ return ! fromNodes.equals(toNodes);
+ }
+ else {
+ if ( ! fromNodes.justNonNumbers().equalsWhereSpecified(toNodes.justNonNumbers())) return true;
+ if ( ! canInPlaceResize()) return true;
+ return false;
+ }
+ }
+
+ private double toHours(Duration duration) {
+ return duration.toMillis() / 3600000.0;
+ }
+
+ private boolean canInPlaceResize() {
+ return canInPlaceResize(from.nodes(), from.advertisedResources().nodeResources(),
+ to.nodes(), to.advertisedResources().nodeResources(),
+ model.clusterSpec().type(), model.isExclusive(), from.groups() != to.groups());
+ }
+
+ public static boolean canInPlaceResize(int fromCount, NodeResources fromResources,
+ int toCount, NodeResources toResources,
+ ClusterSpec.Type type, boolean exclusive, boolean hasTopologyChange) {
+ if (exclusive) return false; // exclusive resources must match the host
+
+ // Never allow in-place resize when also changing topology or decreasing cluster size
+ if (hasTopologyChange || toCount < fromCount) return false;
+
+ // Do not allow increasing cluster size and decreasing node resources at the same time for content nodes
+ if (type.isContent() && toCount > fromCount && !toResources.satisfies(fromResources.justNumbers()))
+ return false;
+
+ return true;
+ }
+
+}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
index 92f86325cf7..6a01a2bcd18 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/maintenance/AutoscalingMaintainer.java
@@ -16,7 +16,7 @@ import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Applications;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
-import com.yahoo.vespa.hosted.provision.autoscale.AllocatableClusterResources;
+import com.yahoo.vespa.hosted.provision.autoscale.AllocatableResources;
import com.yahoo.vespa.hosted.provision.autoscale.Autoscaler;
import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling;
import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricSnapshot;
@@ -87,7 +87,7 @@ public class AutoscalingMaintainer extends NodeRepositoryMaintainer {
NodeList clusterNodes = nodeRepository().nodes().list(Node.State.active).owner(applicationId).cluster(clusterId);
cluster = updateCompletion(cluster, clusterNodes);
- var current = new AllocatableClusterResources(clusterNodes.not().retired(), nodeRepository()).advertisedResources();
+ var current = new AllocatableResources(clusterNodes.not().retired(), nodeRepository()).advertisedResources();
// Autoscale unless an autoscaling is already in progress
Autoscaling autoscaling = null;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
index 3d0c1069584..a67a513550a 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeRepositoryProvisioner.java
@@ -23,7 +23,7 @@ import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
-import com.yahoo.vespa.hosted.provision.autoscale.AllocatableClusterResources;
+import com.yahoo.vespa.hosted.provision.autoscale.AllocatableResources;
import com.yahoo.vespa.hosted.provision.autoscale.AllocationOptimizer;
import com.yahoo.vespa.hosted.provision.autoscale.ClusterModel;
import com.yahoo.vespa.hosted.provision.autoscale.Limits;
@@ -182,12 +182,12 @@ public class NodeRepositoryProvisioner implements Provisioner {
.not().retired()
.not().removable();
boolean firstDeployment = nodes.isEmpty();
- AllocatableClusterResources currentResources =
+ var current =
firstDeployment // start at min, preserve current resources otherwise
- ? new AllocatableClusterResources(initialResourcesFrom(requested, clusterSpec, application.id()), clusterSpec, nodeRepository)
- : new AllocatableClusterResources(nodes, nodeRepository);
- var clusterModel = new ClusterModel(nodeRepository, application, clusterSpec, cluster, nodes, nodeRepository.metricsDb(), nodeRepository.clock());
- return within(Limits.of(requested), currentResources, firstDeployment, clusterModel);
+ ? new AllocatableResources(initialResourcesFrom(requested, clusterSpec, application.id()), clusterSpec, nodeRepository)
+ : new AllocatableResources(nodes, nodeRepository);
+ var model = new ClusterModel(nodeRepository, application, clusterSpec, cluster, nodes, current, nodeRepository.metricsDb(), nodeRepository.clock());
+ return within(Limits.of(requested), model, firstDeployment);
}
private ClusterResources initialResourcesFrom(Capacity requested, ClusterSpec clusterSpec, ApplicationId applicationId) {
@@ -197,21 +197,19 @@ public class NodeRepositoryProvisioner implements Provisioner {
/** Make the minimal adjustments needed to the current resources to stay within the limits */
private ClusterResources within(Limits limits,
- AllocatableClusterResources current,
- boolean firstDeployment,
- ClusterModel clusterModel) {
+ ClusterModel model,
+ boolean firstDeployment) {
if (limits.min().equals(limits.max())) return limits.min();
// Don't change current deployments that are still legal
- if (! firstDeployment && current.advertisedResources().isWithin(limits.min(), limits.max()))
- return current.advertisedResources();
+ if (! firstDeployment && model.current().advertisedResources().isWithin(limits.min(), limits.max()))
+ return model.current().advertisedResources();
// Otherwise, find an allocation that preserves the current resources as well as possible
return allocationOptimizer.findBestAllocation(Load.one(),
- current,
- clusterModel,
+ model,
limits)
- .orElseThrow(() -> newNoAllocationPossible(current.clusterSpec(), limits))
+ .orElseThrow(() -> newNoAllocationPossible(model.current().clusterSpec(), limits))
.advertisedResources();
}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java
index cea0608013d..77f37cadc0b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/NodeSpec.java
@@ -6,6 +6,7 @@ import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeType;
import com.yahoo.vespa.hosted.provision.Node;
+import com.yahoo.vespa.hosted.provision.autoscale.ResourceChange;
import java.time.Duration;
import java.util.Map;
@@ -162,16 +163,11 @@ public interface NodeSpec {
@Override
public boolean canResize(NodeResources currentNodeResources, NodeResources currentSpareHostResources,
ClusterSpec.Type type, boolean hasTopologyChange, int currentClusterSize) {
- if (exclusive) return false; // exclusive resources must match the host
- // Never allow in-place resize when also changing topology or decreasing cluster size
- if (hasTopologyChange || count < currentClusterSize) return false;
+ return ResourceChange.canInPlaceResize(currentClusterSize, currentNodeResources, count, requestedNodeResources,
+ type, exclusive, hasTopologyChange)
+ &&
+ currentSpareHostResources.add(currentNodeResources.justNumbers()).satisfies(requestedNodeResources);
- // Do not allow increasing cluster size and decreasing node resources at the same time for content nodes
- if (type.isContent() && count > currentClusterSize && !requestedNodeResources.satisfies(currentNodeResources.justNumbers()))
- return false;
-
- // Otherwise, allowed as long as the host can satisfy the new requested resources
- return currentSpareHostResources.add(currentNodeResources.justNumbers()).satisfies(requestedNodeResources);
}
@Override
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
index d33857d1a1e..c7171b6b478 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTest.java
@@ -6,6 +6,7 @@ import com.yahoo.config.provision.ClusterInfo;
import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Environment;
+import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.IntRange;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.NodeResources.DiskSpeed;
@@ -18,6 +19,7 @@ import com.yahoo.vespa.hosted.provision.provisioning.DynamicProvisioningTester;
import org.junit.Test;
import java.time.Duration;
+import java.util.List;
import java.util.Optional;
import static com.yahoo.config.provision.NodeResources.DiskSpeed.fast;
@@ -88,7 +90,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(7));
fixture.loader().applyCpuLoad(0.1f, 10);
fixture.tester().assertResources("Scaling cpu down since usage has gone down significantly",
- 6, 1, 1.1, 9.8, 390.2,
+ 9, 1, 1.0, 6.5, 243.9,
fixture.autoscale());
}
@@ -173,7 +175,7 @@ public class AutoscalingTest {
fixture.setScalingDuration(Duration.ofHours(12)); // Fixture sets last completion to be 1 day into the past
fixture.loader().applyLoad(new Load(1.0, 0.1, 1.0), 10);
fixture.tester().assertResources("Scaling up (only) since resource usage is too high",
- 8, 1, 7.1, 9.3, 75.4,
+ 5, 1, 11.7, 15.4, 132.0,
fixture.autoscale());
}
@@ -185,7 +187,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(Duration.ofDays(2));
fixture.loader().applyLoad(new Load(1.0, 0.1, 1.0), 10);
fixture.tester().assertResources("Scaling cpu and disk up and memory down",
- 7, 1, 8.2, 4.0, 88.0,
+ 5, 1, 11.7, 4.0, 132.0,
fixture.autoscale());
}
@@ -208,7 +210,7 @@ public class AutoscalingTest {
fixture.loader().applyCpuLoad(0.70, 1);
fixture.loader().applyCpuLoad(0.01, 100);
fixture.tester().assertResources("Scaling up since peak resource usage is too high",
- 8, 1, 4.3, 7.4, 29.0,
+ 5, 1, 7.1, 12.3, 50.7,
fixture.autoscale());
}
@@ -232,7 +234,7 @@ public class AutoscalingTest {
fixture.loader().applyCpuLoad(0.70, 1);
fixture.loader().applyCpuLoad(0.01, 100);
fixture.tester().assertResources("Scaling up cpu since peak resource usage is too high",
- 8, 1, 4.3, 7.7, 34.3,
+ 5, 1, 7.1, 12.8, 60.0,
fixture.autoscale());
}
@@ -393,11 +395,10 @@ public class AutoscalingTest {
.initialResources(Optional.of(now))
.capacity(Capacity.from(min, max))
.build();
- fixture.setScalingDuration(Duration.ofHours(6));
fixture.tester().clock().advance(Duration.ofDays(2));
- fixture.loader().applyCpuLoad(0.4, 240);
+ fixture.loader().applyCpuLoad(0.5, 240);
fixture.tester().assertResources("Scaling cpu up",
- 6, 6, 5.0, 7.4, 22.3,
+ 6, 6, 4.5, 7.4, 22.3,
fixture.autoscale());
}
@@ -460,7 +461,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyCpuLoad(1.0, 120);
fixture.tester().assertResources("Suggesting above capacity limit",
- 8, 1, 6.2, 7.4, 29.0,
+ 5, 1, 10.2, 12.3, 50.7,
fixture.tester().suggest(fixture.applicationId, fixture.clusterSpec.id(), min, min));
}
@@ -520,7 +521,7 @@ public class AutoscalingTest {
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyLoad(new Load(0.5, 0.8, 0.1), 120);
fixture.tester().assertResources("Suggesting resources where disk is 3x memory (this is a content cluster)",
- 11, 1, 13.0, 60.0, 179.9,
+ 10, 1, 14.3, 66.2, 198.6,
fixture.tester().suggest(fixture.applicationId, fixture.clusterSpec.id(), min, min));
fixture.tester().assertResources("Autoscaling to resources where disk is 3x memory (this is a content cluster)",
10, 1, 10.0, 66.2, 198.6,
@@ -593,13 +594,12 @@ public class AutoscalingTest {
.initialResources(Optional.of(now))
.capacity(Capacity.from(min, max))
.build();
- fixture.setScalingDuration(Duration.ofHours(6));
fixture.tester().clock().advance(Duration.ofDays(2));
Duration timePassed = fixture.loader().addCpuMeasurements(0.25, 120);
fixture.tester().clock().advance(timePassed.negated());
fixture.loader().addLoadMeasurements(10, t -> t == 0 ? 200.0 : 100.0, t -> 10.0);
- fixture.tester().assertResources("Scaling up cpu, others down, changing to 1 group is cheaper",
- 7, 1, 3.2, 43.3, 129.8,
+ fixture.tester().assertResources("Changing to 1 group is cheaper",
+ 7, 1, 2.5, 43.3, 129.8,
fixture.autoscale());
}
@@ -650,11 +650,10 @@ public class AutoscalingTest {
.initialResources(Optional.of(now))
.capacity(Capacity.from(min, max))
.build();
- fixture.setScalingDuration(Duration.ofHours(6));
fixture.tester().clock().advance(Duration.ofDays(2));
fixture.loader().applyLoad(new Load(0.16, 0.02, 0.5), 120);
fixture.tester().assertResources("Scaling down memory",
- 7, 1, 2.5, 4.0, 80.2,
+ 6, 1, 2.1, 4.0, 96.2,
fixture.autoscale());
}
@@ -710,16 +709,16 @@ public class AutoscalingTest {
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.25, 200);
fixture.tester().assertResources("Scale up since we assume we need 2x cpu for growth when no scaling time data",
- 8, 1, 1.6, 7.4, 29.0,
+ 5, 1, 2.6, 12.3, 50.7,
fixture.autoscale());
fixture.setScalingDuration(Duration.ofHours(8));
fixture.tester().clock().advance(Duration.ofDays(2));
timeAdded = fixture.loader().addLoadMeasurements(100, t -> 100.0 + (t < 50 ? t : 100 - t), t -> 0.0);
fixture.tester.clock().advance(timeAdded.negated());
- fixture.loader().addCpuMeasurements(0.25, 200);
+ fixture.loader().addCpuMeasurements(0.20, 200);
fixture.tester().assertResources("Scale down since observed growth is slower than scaling time",
- 8, 1, 1.2, 7.4, 29.0,
+ 5, 1, 1.6, 12.3, 50.7,
fixture.autoscale());
fixture.setScalingDuration(Duration.ofHours(8));
@@ -730,7 +729,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.25, 200);
fixture.tester().assertResources("Scale up since observed growth is faster than scaling time",
- 8, 1, 1.5, 7.4, 29.0,
+ 5, 1, 2.4, 12.3, 50.7,
fixture.autoscale());
}
@@ -747,7 +746,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.7, 200);
fixture.tester().assertResources("Scale up slightly since observed growth is faster than scaling time, but we are not confident",
- 8, 1, 1.3, 7.4, 29.0,
+ 5, 1, 2.2, 12.3, 50.7,
fixture.autoscale());
}
@@ -766,16 +765,16 @@ public class AutoscalingTest {
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.4, 200);
fixture.tester.assertResources("Query and write load is equal -> scale up somewhat",
- 8, 1, 1.8, 7.4, 29.0,
+ 5, 1, 2.9, 12.3, 50.7,
fixture.autoscale());
fixture.tester().clock().advance(Duration.ofDays(2));
timeAdded = fixture.loader().addLoadMeasurements(100, t -> t == 0 ? 800.0 : 400.0, t -> 100.0);
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.4, 200);
- // TODO: Ackhually, we scale down here - why?
+ // TODO: Ackhually, we scale up less here - why?
fixture.tester().assertResources("Query load is 4x write load -> scale up more",
- 8, 1, 1.4, 7.4, 29.0,
+ 5, 1, 2.2, 12.3, 50.7,
fixture.autoscale());
fixture.tester().clock().advance(Duration.ofDays(2));
@@ -783,7 +782,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.4, 200);
fixture.tester().assertResources("Write load is 10x query load -> scale down",
- 6, 1, 1.1, 10.0, 40.5,
+ 5, 1, 1.3, 12.3, 50.7,
fixture.autoscale());
fixture.tester().clock().advance(Duration.ofDays(2));
@@ -791,7 +790,7 @@ public class AutoscalingTest {
fixture.tester.clock().advance(timeAdded.negated());
fixture.loader().addCpuMeasurements(0.4, 200);
fixture.tester().assertResources("Query only -> larger",
- 8, 1, 2.1, 7.4, 29.0,
+ 5, 1, 3.5, 12.3, 50.7,
fixture.autoscale());
fixture.tester().clock().advance(Duration.ofDays(2));
@@ -954,4 +953,61 @@ public class AutoscalingTest {
.build();
}
+ @Test
+ public void change_not_requiring_node_replacement_is_preferred() {
+ var min = new ClusterResources(5, 1, new NodeResources( 16, 64, 200, 1, DiskSpeed.fast, StorageType.remote));
+ var max = new ClusterResources(6, 1, new NodeResources( 16, 64, 200, 1, DiskSpeed.fast, StorageType.remote));
+
+ List<Flavor> flavors = List.of(new Flavor("arm_16", new NodeResources( 16, 64, 200, 1, DiskSpeed.fast, StorageType.remote, NodeResources.Architecture.arm64)),
+ new Flavor("x86_16", new NodeResources( 16, 64, 200, 1, DiskSpeed.fast, StorageType.remote, NodeResources.Architecture.x86_64)));
+ var fixture = DynamicProvisioningTester.fixture()
+ .clusterType(ClusterSpec.Type.container)
+ .hostFlavors(flavors)
+ .awsZone(false, Environment.prod)
+ .capacity(Capacity.from(min, max))
+ .initialResources(Optional.of(min.with(min.nodeResources().with(NodeResources.Architecture.x86_64))))
+ .build();
+ var nodes = fixture.nodes().not().retired().asList();
+ assertEquals(5, nodes.size());
+ assertEquals(NodeResources.Architecture.x86_64, nodes.get(0).resources().architecture());
+
+ fixture.tester().clock().advance(Duration.ofHours(5));
+ fixture.loader().applyCpuLoad(0.27, 10); // trigger rescaling, but don't cause fulfilment < 1
+ var autoscaling = fixture.autoscale();
+ fixture.deploy(Capacity.from(autoscaling.resources().get()));
+ nodes = fixture.nodes().not().retired().asList();
+ assertEquals(6, nodes.size());
+ assertEquals("We stay with x86 even though the first matching flavor is arm",
+ NodeResources.Architecture.x86_64, nodes.get(0).resources().architecture());
+ }
+
+    // Verify that we choose not to increase to 3 nodes even though that is cheaper (due to redundancy),
+ // due to considering the cost of redistribution. This depends quite finely on the parameters,
+ // and the easiest way to move it back if there is a change is to increase the scaling duration,
+ // as that is a redistribution cost multiplier (until redistribution is measured separately).
+ @Test
+ public void change_not_causing_redistribution_is_preferred() {
+ var min = new ClusterResources(2, 1, new NodeResources( 16, 64, 200, 1, DiskSpeed.fast, StorageType.remote));
+ var max = new ClusterResources(4, 1, new NodeResources( 32, 64, 200, 1, DiskSpeed.fast, StorageType.remote));
+
+ var fixture = DynamicProvisioningTester.fixture()
+ .clusterType(ClusterSpec.Type.content)
+ .awsSetup(true, Environment.prod)
+ .capacity(Capacity.from(min, max))
+ .initialResources(Optional.of(min))
+ .build();
+ fixture.setScalingDuration(Duration.ofMinutes(35));
+ var nodes = fixture.nodes().not().retired().asList();
+ assertEquals(2, nodes.size());
+ assertEquals(16.0, nodes.get(0).resources().vcpu(), 0.000001);
+
+ fixture.tester().clock().advance(Duration.ofHours(5));
+ fixture.loader().applyCpuLoad(0.75, 700); // trigger rescaling, but don't cause fulfilment < 1
+ var autoscaling = fixture.autoscale();
+ fixture.deploy(Capacity.from(autoscaling.resources().get()));
+ nodes = fixture.nodes().not().retired().asList();
+ assertEquals("Increasing cpu is preferred to adding nodes to avoid redistribution", 2, nodes.size());
+ assertEquals(28.5, nodes.get(0).resources().vcpu(), 0.000001);
+ }
+
}
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
index ec084014a6a..f07d52a4a7f 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
@@ -5,17 +5,12 @@ import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Capacity;
import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
-import com.yahoo.config.provision.NodeFlavors;
import com.yahoo.config.provision.NodeResources;
-import com.yahoo.config.provision.Zone;
import com.yahoo.test.ManualClock;
-import com.yahoo.vespa.curator.mock.MockCurator;
-import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
import com.yahoo.vespa.hosted.provision.applications.Status;
import com.yahoo.vespa.hosted.provision.provisioning.ProvisioningTester;
-import com.yahoo.vespa.hosted.provision.testutils.MockNodeRepository;
import org.junit.Test;
import java.time.Duration;
@@ -36,10 +31,10 @@ public class ClusterModelTest {
public void unit_adjustment_should_cause_no_change() {
var model = clusterModelWithNoData(); // 5 nodes, 1 group
assertEquals(Load.one(), model.loadAdjustment());
- var target = model.loadAdjustment().scaled(resources());
+ var target = model.loadAdjustment().scaled(nodeResources());
int testingNodes = 5 - 1;
int currentNodes = 5 - 1;
- assertEquals(resources(), model.loadWith(testingNodes, 1).scaled(Load.one().divide(model.loadWith(currentNodes, 1)).scaled(target)));
+ assertEquals(nodeResources(), model.loadWith(testingNodes, 1).scaled(Load.one().divide(model.loadWith(currentNodes, 1)).scaled(target)));
}
@Test
@@ -91,16 +86,23 @@ public class ClusterModelTest {
ManualClock clock = new ManualClock();
Application application = Application.empty(ApplicationId.from("t1", "a1", "i1"));
ClusterSpec clusterSpec = clusterSpec();
- Cluster cluster = cluster(resources());
+ Cluster cluster = cluster();
application = application.with(cluster);
- return new ClusterModel(new ProvisioningTester.Builder().build().nodeRepository(),
+ var nodeRepository = new ProvisioningTester.Builder().build().nodeRepository();
+ return new ClusterModel(nodeRepository,
application.with(status),
- clusterSpec, cluster, clock, Duration.ofMinutes(10),
+ clusterSpec, cluster,
+ new AllocatableResources(clusterResources(), clusterSpec, nodeRepository),
+ clock, Duration.ofMinutes(10), Duration.ofMinutes(5),
timeseries(cluster,100, queryRate, writeRate, clock),
ClusterNodesTimeseries.empty());
}
- private NodeResources resources() {
+ private ClusterResources clusterResources() {
+ return new ClusterResources(5, 1, nodeResources());
+ }
+
+ private NodeResources nodeResources() {
return new NodeResources(1, 10, 100, 1);
}
@@ -111,10 +113,10 @@ public class ClusterModelTest {
.build();
}
- private Cluster cluster(NodeResources resources) {
+ private Cluster cluster() {
return Cluster.create(ClusterSpec.Id.from("test"),
false,
- Capacity.from(new ClusterResources(5, 1, resources)));
+ Capacity.from(clusterResources()));
}
/** Creates the given number of measurements, spaced 5 minutes between, using the given function */
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
index 33d3d3d50dc..78feba14fbf 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/Fixture.java
@@ -5,17 +5,14 @@ import com.yahoo.component.Version;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Capacity;
import com.yahoo.config.provision.Cloud;
-import com.yahoo.config.provision.ClusterInfo;
import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Environment;
import com.yahoo.config.provision.Flavor;
-import com.yahoo.config.provision.NodeFlavors;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.config.provision.RegionName;
import com.yahoo.config.provision.SystemName;
import com.yahoo.config.provision.Zone;
-import com.yahoo.vespa.curator.mock.MockCurator;
import com.yahoo.vespa.flags.InMemoryFlagSource;
import com.yahoo.vespa.flags.PermanentFlags;
import com.yahoo.vespa.flags.custom.HostResources;
@@ -29,7 +26,6 @@ import com.yahoo.vespa.hosted.provision.autoscale.awsnodes.AwsHostResourcesCalcu
import com.yahoo.vespa.hosted.provision.autoscale.awsnodes.AwsNodeTypes;
import com.yahoo.vespa.hosted.provision.provisioning.DynamicProvisioningTester;
import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator;
-import com.yahoo.vespa.hosted.provision.testutils.MockNodeRepository;
import java.time.Duration;
import java.util.Arrays;
@@ -72,9 +68,9 @@ public class Fixture {
return tester().nodeRepository().applications().get(applicationId).orElse(Application.empty(applicationId));
}
- public AllocatableClusterResources currentResources() {
- return new AllocatableClusterResources(tester.nodeRepository().nodes().list(Node.State.active).owner(applicationId).cluster(clusterId()),
- tester.nodeRepository());
+ public AllocatableResources currentResources() {
+ return new AllocatableResources(tester.nodeRepository().nodes().list(Node.State.active).owner(applicationId).cluster(clusterId()),
+ tester.nodeRepository());
}
public Cluster cluster() {
@@ -89,6 +85,7 @@ public class Fixture {
clusterSpec,
cluster(),
nodes(),
+ new AllocatableResources(nodes(), tester.nodeRepository()),
tester.nodeRepository().metricsDb(),
tester.nodeRepository().clock());
}
@@ -180,6 +177,7 @@ public class Fixture {
new NodeResources(100, 1000, 1000, 1, NodeResources.DiskSpeed.any)));
HostResourcesCalculator resourceCalculator = new DynamicProvisioningTester.MockHostResourcesCalculator(zone);
final InMemoryFlagSource flagSource = new InMemoryFlagSource();
+ boolean reversedFlavorOrder = false;
int hostCount = 0;
public Fixture.Builder zone(Zone zone) {
@@ -228,12 +226,16 @@ public class Fixture {
public Fixture.Builder awsSetup(boolean allowHostSharing, Environment environment) {
return this.awsHostFlavors()
.awsResourceCalculator()
- .zone(new Zone(Cloud.builder().dynamicProvisioning(true)
- .allowHostSharing(allowHostSharing)
- .build(),
- SystemName.Public,
- environment,
- RegionName.from("aws-eu-west-1a")));
+ .awsZone(allowHostSharing, environment);
+ }
+
+ public Fixture.Builder awsZone(boolean allowHostSharing, Environment environment) {
+ return zone(new Zone(Cloud.builder().dynamicProvisioning(true)
+ .allowHostSharing(allowHostSharing)
+ .build(),
+ SystemName.Public,
+ environment,
+ RegionName.from("aws-eu-west-1a")));
}
public Fixture.Builder vespaVersion(Version version) {
@@ -246,6 +248,11 @@ public class Fixture {
return this;
}
+ public Fixture.Builder hostFlavors(List<Flavor> hostFlavors) {
+ this.hostFlavors = hostFlavors;
+ return this;
+ }
+
/** Adds the host resources available on AWS. */
public Fixture.Builder awsHostFlavors() {
this.hostFlavors = AwsNodeTypes.asFlavors();