From c5d4a25cb7abf7fac55fc4588617495685cd592b Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Fri, 30 Dec 2022 15:17:00 +0100 Subject: Store the load observed and inferred when making an autoscaling decision --- .../hosted/provision/autoscale/Autoscaler.java | 16 +++-- .../hosted/provision/autoscale/Autoscaling.java | 71 ++++++++++++++++------ .../hosted/provision/autoscale/ClusterModel.java | 12 ++-- .../persistence/ApplicationSerializer.java | 36 +++++++++-- .../hosted/provision/provisioning/Activator.java | 2 +- .../provision/testutils/MockNodeRepository.java | 17 ++++-- 6 files changed, 107 insertions(+), 47 deletions(-) (limited to 'node-repository/src/main/java') diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index 7e429492de2..2fc3f32acfe 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -51,8 +51,6 @@ public class Autoscaler { * @return scaling advice for this cluster */ public Autoscaling autoscale(Application application, Cluster cluster, NodeList clusterNodes) { - if (cluster.minResources().equals(cluster.maxResources())) - return Autoscaling.dontScale(Autoscaling.Status.unavailable, "Autoscaling is not enabled", now()); return autoscale(application, cluster, clusterNodes, Limits.of(cluster)); } @@ -64,24 +62,26 @@ public class Autoscaler { clusterNodes, nodeRepository.metricsDb(), nodeRepository.clock()); + if (! limits.isEmpty() && cluster.minResources().equals(cluster.maxResources())) + return Autoscaling.dontScale(Autoscaling.Status.unavailable, "Autoscaling is not enabled", clusterModel); if ( ! clusterIsStable(clusterNodes, nodeRepository)) - return Autoscaling.dontScale(Status.waiting, "Cluster change in progress", now()); + return Autoscaling.dontScale(Status.waiting, "Cluster change in progress", clusterModel); var currentAllocation = new AllocatableClusterResources(clusterNodes, nodeRepository); Optional bestAllocation = allocationOptimizer.findBestAllocation(clusterModel.loadAdjustment(), currentAllocation, clusterModel, limits); if (bestAllocation.isEmpty()) - return Autoscaling.dontScale(Status.insufficient, "No allocations are possible within configured limits", now()); + return Autoscaling.dontScale(Status.insufficient, "No allocations are possible within configured limits", clusterModel); if (! worthRescaling(currentAllocation.realResources(), bestAllocation.get().realResources())) { if (bestAllocation.get().fulfilment() < 1) - return Autoscaling.dontScale(Status.insufficient, "Configured limits prevents better scaling of this cluster", now()); + return Autoscaling.dontScale(Status.insufficient, "Configured limits prevents better scaling of this cluster", clusterModel); else - return Autoscaling.dontScale(Status.ideal, "Cluster is ideally scaled", now()); + return Autoscaling.dontScale(Status.ideal, "Cluster is ideally scaled", clusterModel); } - return Autoscaling.scaleTo(bestAllocation.get().advertisedResources(), now()); + return Autoscaling.scaleTo(bestAllocation.get().advertisedResources(), clusterModel); } public static boolean clusterIsStable(NodeList clusterNodes, NodeRepository nodeRepository) { @@ -113,8 +113,6 @@ public class Autoscaler { return from < to && ! similar(from, to, resourceDifferenceWorthReallocation); } - private Instant now() { return nodeRepository.clock().instant(); } - private static boolean similar(double r1, double r2, double threshold) { return Math.abs(r1 - r2) / (( r1 + r2) / 2) < threshold; } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaling.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaling.java index 427a5d01531..579f9c2514f 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaling.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaling.java @@ -7,7 +7,7 @@ import java.util.Objects; import java.util.Optional; /** - * An autoscaling result. + * An autoscaling conclusion and the context that led to it. * * @author bratseth */ @@ -17,19 +17,20 @@ public class Autoscaling { private final String description; private final Optional resources; private final Instant at; + private final Load peak; + private final Load ideal; - public Autoscaling(Status status, String description, ClusterResources resources, Instant at) { - this(status, description, Optional.of(resources), at); - } - - public Autoscaling(Status status, String description, Optional resources, Instant at) { + public Autoscaling(Status status, String description, Optional resources, Instant at, + Load peak, Load ideal) { this.status = status; this.description = description; this.resources = resources; this.at = at; + this.peak = peak; + this.ideal = ideal; } - /** Returns the resource target of this, or empty if non target. */ + /** Returns the resource target of this, or empty if none (meaning keep the current allocation). */ public Optional resources() { return resources; } @@ -38,11 +39,27 @@ public class Autoscaling { public String description() { return description; } - /** Returns the time this target was decided. */ + /** Returns the time this was decided. */ public Instant at() { return at; } + /** Returns the peak load seen in the period considered in this. */ + public Load peak() { return peak; } + + /** Returns the ideal load the cluster in question should have. */ + public Load ideal() { return ideal; } + public Autoscaling with(Status status, String description) { - return new Autoscaling(status, description, resources, at); + return new Autoscaling(status, description, resources, at, peak, ideal); + } + + /** Converts this autoscaling into an ideal one at the completion of it. */ + public Autoscaling asIdeal(Instant at) { + return new Autoscaling(Status.ideal, + "Cluster is ideally scaled within configured limits", + Optional.empty(), + at, + peak, + ideal); } @Override @@ -52,35 +69,49 @@ public class Autoscaling { if ( ! this.description.equals(other.description)) return false; if ( ! this.resources.equals(other.resources)) return false; if ( ! this.at.equals(other.at)) return false; + if ( ! this.peak.equals(other.peak)) return false; + if ( ! this.ideal.equals(other.ideal)) return false; return true; } @Override public int hashCode() { - return Objects.hash(status, description, at); + return Objects.hash(status, description, at, peak, ideal); } @Override public String toString() { - return "autoscaling to " + resources + ", made at " + at; + return (resources.isPresent() ? "Autoscaling to " + resources : "Don't autoscale") + + (description.isEmpty() ? "" : ": " + description); } - public static Autoscaling empty() { return new Autoscaling(Status.unavailable, "", Optional.empty(), Instant.EPOCH); } - - public static Autoscaling dontScale(Status status, String description, Instant at) { - return new Autoscaling(status, description, Optional.empty(), at); + public static Autoscaling empty() { + return new Autoscaling(Status.unavailable, + "", + Optional.empty(), + Instant.EPOCH, + Load.zero(), + Load.zero()); } - public static Autoscaling ideal(Instant at) { - return new Autoscaling(Status.ideal, "Cluster is ideally scaled within configured limits", - Optional.empty(), at); + /** Creates an autoscaling conclusion which does not change the current allocation for a specified reason. */ + public static Autoscaling dontScale(Status status, String description, ClusterModel clusterModel) { + return new Autoscaling(status, + description, + Optional.empty(), + clusterModel.at(), + clusterModel.peakLoad(), + clusterModel.idealLoad()); } - public static Autoscaling scaleTo(ClusterResources target, Instant at) { + /** Creates an autoscaling conclusion to scale. */ + public static Autoscaling scaleTo(ClusterResources target, ClusterModel clusterModel) { return new Autoscaling(Status.rescaling, "Rescaling initiated due to load changes", Optional.of(target), - at); + clusterModel.at(), + clusterModel.peakLoad(), + clusterModel.idealLoad()); } public enum Status { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java index 1928a784763..03cefee7a63 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -53,6 +53,7 @@ public class ClusterModel { private final Duration scalingDuration; private final ClusterTimeseries clusterTimeseries; private final ClusterNodesTimeseries nodeTimeseries; + private final Instant at; // Lazily initialized members private Double queryFractionOfMax = null; @@ -75,6 +76,7 @@ public class ClusterModel { this.scalingDuration = computeScalingDuration(cluster, clusterSpec); this.clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id()); this.nodeTimeseries = new ClusterNodesTimeseries(scalingDuration(), cluster, nodes, metricsDb); + this.at = clock.instant(); } ClusterModel(Zone zone, @@ -95,6 +97,7 @@ public class ClusterModel { this.scalingDuration = scalingDuration; this.clusterTimeseries = clusterTimeseries; this.nodeTimeseries = nodeTimeseries; + this.at = clock.instant(); } public Application application() { return application; } @@ -151,12 +154,6 @@ public class ClusterModel { return averageQueryRate = clusterTimeseries().queryRate(scalingDuration(), clock); } - /** Returns the average of the last load measurement from each node. */ - public Load currentLoad() { return nodeTimeseries().currentLoad(); } - - /** Returns the average of all load measurements from all nodes*/ - public Load averageLoad() { return nodeTimeseries().averageLoad(); } - /** Returns the average of the peak load measurement in each dimension, from each node. */ public Load peakLoad() { return nodeTimeseries().peakLoad(); } @@ -239,6 +236,9 @@ public class ClusterModel { (1 - queryCpuFraction) * idealWriteCpuLoad; } + /** Returns the instant this model was created. */ + public Instant at() { return at;} + /** Returns the headroom for growth during organic traffic growth as a multiple of current resources. */ private double growthRateHeadroom() { if ( ! zone.environment().isProduction()) return 1; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java index 3733eb69e2e..2f5b057e927 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/persistence/ApplicationSerializer.java @@ -14,6 +14,7 @@ import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.applications.ScalingEvent; import com.yahoo.vespa.hosted.provision.applications.Status; import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling; +import com.yahoo.vespa.hosted.provision.autoscale.Load; import java.io.IOException; import java.io.UncheckedIOException; @@ -57,6 +58,11 @@ public class ApplicationSerializer { private static final String scalingEventsKey = "scalingEvents"; private static final String autoscalingStatusObjectKey = "autoscalingStatusObject"; private static final String descriptionKey = "description"; + private static final String peakKey = "peak"; + private static final String idealKey = "ideal"; + private static final String cpuKey = "cpu"; + private static final String memoryKey = "memory"; + private static final String diskKey = "disk"; private static final String fromKey = "from"; private static final String toKey = "to"; private static final String generationKey = "generation"; @@ -134,10 +140,12 @@ public class ApplicationSerializer { } private static void toSlime(Autoscaling autoscaling, Cursor autoscalingObject) { - autoscaling.resources().ifPresent(resources -> toSlime(resources, autoscalingObject.setObject(resourcesKey))); - autoscalingObject.setLong(atKey, autoscaling.at().toEpochMilli()); autoscalingObject.setString(statusKey, toAutoscalingStatusCode(autoscaling.status())); autoscalingObject.setString(descriptionKey, autoscaling.description()); + autoscaling.resources().ifPresent(resources -> toSlime(resources, autoscalingObject.setObject(resourcesKey))); + autoscalingObject.setLong(atKey, autoscaling.at().toEpochMilli()); + toSlime(autoscaling.peak(), autoscalingObject.setObject(peakKey)); + toSlime(autoscaling.ideal(), autoscalingObject.setObject(idealKey)); } private static void toSlime(ClusterResources resources, Cursor clusterResourcesObject) { @@ -157,6 +165,18 @@ public class ApplicationSerializer { NodeResourcesSerializer.resourcesFromSlime(clusterResourcesObject.field(nodeResourcesKey))); } + private static void toSlime(Load load, Cursor loadObject) { + loadObject.setDouble(cpuKey, load.cpu()); + loadObject.setDouble(memoryKey, load.memory()); + loadObject.setDouble(diskKey, load.disk()); + } + + private static Load loadFromSlime(Inspector loadObject) { + return new Load(loadObject.field(cpuKey).asDouble(), + loadObject.field(memoryKey).asDouble(), + loadObject.field(diskKey).asDouble()); + } + private static Autoscaling autoscalingFromSlime(Inspector autoscalingObject, Inspector legacyAutoscalingStatusObject) { if ( ! autoscalingObject.valid()) return Autoscaling.empty(); @@ -165,20 +185,26 @@ public class ApplicationSerializer { return new Autoscaling(fromAutoscalingStatusCode(legacyAutoscalingStatusObject.field(statusKey).asString()), legacyAutoscalingStatusObject.field(descriptionKey).asString(), optionalClusterResourcesFromSlime(autoscalingObject), - Instant.EPOCH); + Instant.EPOCH, + Load.zero(), + Load.zero()); } if (legacyAutoscalingStatusObject.valid()) { // TODO: Remove after January 2023 return new Autoscaling(fromAutoscalingStatusCode(legacyAutoscalingStatusObject.field(statusKey).asString()), legacyAutoscalingStatusObject.field(descriptionKey).asString(), optionalClusterResourcesFromSlime(autoscalingObject.field(resourcesKey)), - Instant.ofEpochMilli(autoscalingObject.field(atKey).asLong())); + Instant.ofEpochMilli(autoscalingObject.field(atKey).asLong()), + loadFromSlime(autoscalingObject.field(peakKey)), + loadFromSlime(autoscalingObject.field(idealKey))); } return new Autoscaling(fromAutoscalingStatusCode(autoscalingObject.field(statusKey).asString()), autoscalingObject.field(descriptionKey).asString(), optionalClusterResourcesFromSlime(autoscalingObject.field(resourcesKey)), - Instant.ofEpochMilli(autoscalingObject.field(atKey).asLong())); + Instant.ofEpochMilli(autoscalingObject.field(atKey).asLong()), + loadFromSlime(autoscalingObject.field(peakKey)), + loadFromSlime(autoscalingObject.field(idealKey))); } private static void scalingEventsToSlime(List scalingEvents, Cursor eventArray) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java index 788dabc0949..caf936e8aeb 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/provisioning/Activator.java @@ -120,7 +120,7 @@ class Activator { } if (cluster.target().resources().isPresent() && cluster.target().resources().get().justNumbers().equals(currentResources.justNumbers())) { - cluster = cluster.withTarget(Autoscaling.ideal(nodeRepository.clock().instant())); + cluster = cluster.withTarget(cluster.target().asIdeal(nodeRepository.clock().instant())); } if (cluster != modified.cluster(clusterEntry.getKey()).get()) modified = modified.with(cluster); diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java index e53a67bd5d4..91c8f803429 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/testutils/MockNodeRepository.java @@ -30,6 +30,7 @@ import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Application; import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.autoscale.Autoscaling; +import com.yahoo.vespa.hosted.provision.autoscale.Load; import com.yahoo.vespa.hosted.provision.autoscale.MemoryMetricsDb; import com.yahoo.vespa.hosted.provision.node.Agent; import com.yahoo.vespa.hosted.provision.node.IP; @@ -198,14 +199,18 @@ public class MockNodeRepository extends NodeRepository { Cluster cluster1 = app1.cluster(cluster1Id.id()).get(); cluster1 = cluster1.withSuggested(new Autoscaling(Autoscaling.Status.unavailable, "", - new ClusterResources(6, 2, - new NodeResources(3, 20, 100, 1)), - clock().instant())); + Optional.of(new ClusterResources(6, 2, + new NodeResources(3, 20, 100, 1))), + clock().instant(), + Load.zero(), + Load.zero())); cluster1 = cluster1.withTarget(new Autoscaling(Autoscaling.Status.unavailable, "", - new ClusterResources(4, 1, - new NodeResources(3, 16, 100, 1)), - clock().instant())); + Optional.of(new ClusterResources(4, 1, + new NodeResources(3, 16, 100, 1))), + clock().instant(), + Load.zero(), + Load.zero())); try (Mutex lock = applications().lock(app1Id)) { applications().put(app1.with(cluster1), lock); } -- cgit v1.2.3