diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2021-03-18 13:27:14 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-18 13:27:14 +0100 |
commit | 7c5948aceceec2a3c279755622876932fdf5a3ed (patch) | |
tree | 1801a9db90991b40f6f2ff27f609d02f049e3ce0 /node-repository | |
parent | 2e0ad58bb9bbb5cc24289acff25046a5b9442e2b (diff) | |
parent | 95a73e1587180eee3a272aed8a02e865943666cc (diff) |
Merge pull request #17030 from vespa-engine/bratseth/cluster-model
Bratseth/cluster model
Diffstat (limited to 'node-repository')
13 files changed, 259 insertions, 205 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java index 69d7cec4007..59b70ff1ef0 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java @@ -129,32 +129,6 @@ public class Cluster { return new Cluster(id, exclusive, min, max, suggested, target, scalingEvents, autoscalingStatus); } - /** The predicted duration of a rescaling of this cluster */ - public Duration scalingDuration(ClusterSpec clusterSpec) { - int completedEventCount = 0; - Duration totalDuration = Duration.ZERO; - for (ScalingEvent event : scalingEvents()) { - if (event.duration().isEmpty()) continue; - completedEventCount++; - totalDuration = totalDuration.plus(event.duration().get()); - } - - if (completedEventCount == 0) { // Use defaults - if (clusterSpec.isStateful()) return Duration.ofHours(12); - return Duration.ofMinutes(10); - } - else { - Duration predictedDuration = totalDuration.dividedBy(completedEventCount); - - // TODO: Remove when we have reliable completion for content clusters - if (clusterSpec.isStateful() && predictedDuration.minus(Duration.ofHours(12)).isNegative()) - return Duration.ofHours(12); - - if (predictedDuration.minus(Duration.ofMinutes(5)).isNegative()) return Duration.ofMinutes(5); // minimum - return predictedDuration; - } - } - @Override public int hashCode() { return id.hashCode(); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java index ac3430fecf9..9791aabf7b4 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java @@ -59,35 +59,25 @@ public class Autoscaler { } private Advice autoscale(Application application, Cluster cluster, NodeList clusterNodes, Limits limits) { - if ( ! stable(clusterNodes, nodeRepository)) - return Advice.none("Cluster change in progress"); + ClusterModel clusterModel = new ClusterModel(application, cluster, clusterNodes, metricsDb, nodeRepository.clock()); - Duration scalingWindow = cluster.scalingDuration(clusterNodes.clusterSpec()); - if (scaledIn(scalingWindow, cluster)) - return Advice.dontScale("Won't autoscale now: Less than " + scalingWindow + " since last resource change"); + if ( ! clusterIsStable(clusterNodes, nodeRepository)) + return Advice.none("Cluster change in progress"); - var clusterNodesTimeseries = new ClusterNodesTimeseries(scalingWindow, cluster, clusterNodes, metricsDb); - var currentAllocation = new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive()); + if (scaledIn(clusterModel.scalingDuration(), cluster)) + return Advice.dontScale("Won't autoscale now: Less than " + clusterModel.scalingDuration() + " since last resource change"); - int measurementsPerNode = clusterNodesTimeseries.measurementsPerNode(); - if (measurementsPerNode < minimumMeasurementsPerNode(scalingWindow)) + if (clusterModel.nodeTimeseries().measurementsPerNode() < minimumMeasurementsPerNode(clusterModel.scalingDuration())) return Advice.none("Collecting more data before making new scaling decisions: Need to measure for " + - scalingWindow + " since the last resource change completed"); + clusterModel.scalingDuration() + " since the last resource change completed"); - int nodesMeasured = clusterNodesTimeseries.nodesMeasured(); - if (nodesMeasured != clusterNodes.size()) + if (clusterModel.nodeTimeseries().nodesMeasured() != clusterNodes.size()) return Advice.none("Collecting more data before making new scaling decisions: " + - "Have measurements from " + nodesMeasured + " nodes, but require from " + clusterNodes.size()); + "Have measurements from " + clusterModel.nodeTimeseries().nodesMeasured() + + " nodes, but require from " + clusterNodes.size()); - - var scalingDuration = cluster.scalingDuration(clusterNodes.clusterSpec()); - var clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id()); - var target = ResourceTarget.idealLoad(scalingDuration, - clusterTimeseries, - clusterNodesTimeseries, - currentAllocation, - application, - nodeRepository.clock()); + var currentAllocation = new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive()); + var target = ResourceTarget.idealLoad(clusterModel, currentAllocation); Optional<AllocatableClusterResources> bestAllocation = allocationOptimizer.findBestAllocation(target, currentAllocation, limits); @@ -97,13 +87,27 @@ public class Autoscaler { if (similar(bestAllocation.get().realResources(), currentAllocation.realResources())) return Advice.dontScale("Cluster is ideally scaled within configured limits"); - if (isDownscaling(bestAllocation.get(), currentAllocation) && scaledIn(scalingWindow.multipliedBy(3), cluster)) - return Advice.dontScale("Waiting " + scalingWindow.multipliedBy(3) + + if (isDownscaling(bestAllocation.get(), currentAllocation) && scaledIn(clusterModel.scalingDuration().multipliedBy(3), cluster)) + return Advice.dontScale("Waiting " + clusterModel.scalingDuration().multipliedBy(3) + " since the last change before reducing resources"); return Advice.scaleTo(bestAllocation.get().advertisedResources()); } + public static boolean clusterIsStable(NodeList clusterNodes, NodeRepository nodeRepository) { + // The cluster is processing recent changes + if (clusterNodes.stream().anyMatch(node -> node.status().wantToRetire() || + node.allocation().get().membership().retired() || + node.allocation().get().isRemovable())) + return false; + + // A deployment is ongoing + if (nodeRepository.nodes().list(Node.State.reserved).owner(clusterNodes.first().get().allocation().get().owner()).size() > 0) + return false; + + return true; + } + /** Returns true if both total real resources and total cost are similar */ public static boolean similar(ClusterResources a, ClusterResources b) { return similar(a.cost(), b.cost(), costDifferenceWorthReallocation) && @@ -143,20 +147,6 @@ public class Autoscaler { return (int)minimumMeasurements; } - public static boolean stable(NodeList nodes, NodeRepository nodeRepository) { - // The cluster is processing recent changes - if (nodes.stream().anyMatch(node -> node.status().wantToRetire() || - node.allocation().get().membership().retired() || - node.allocation().get().isRemovable())) - return false; - - // A deployment is ongoing - if (nodeRepository.nodes().list(Node.State.reserved).owner(nodes.first().get().allocation().get().owner()).size() > 0) - return false; - - return true; - } - public static class Advice { private final boolean present; diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java new file mode 100644 index 00000000000..4fb91e8592e --- /dev/null +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java @@ -0,0 +1,173 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.hosted.provision.autoscale; + +import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.applications.Application; +import com.yahoo.vespa.hosted.provision.applications.Cluster; +import com.yahoo.vespa.hosted.provision.applications.ScalingEvent; + +import java.time.Clock; +import java.time.Duration; +import java.util.OptionalDouble; + +/** + * A cluster with its associated metrics which allows prediction about its future behavior. + * For single-threaded, short-term usage. + * + * @author bratseth + */ +public class ClusterModel { + + static final double idealQueryCpuLoad = 0.8; + static final double idealWriteCpuLoad = 0.95; + static final double idealMemoryLoad = 0.7; + static final double idealDiskLoad = 0.6; + + private final Application application; + private final Cluster cluster; + private final NodeList nodes; + private final MetricsDb metricsDb; + private final Clock clock; + private final Duration scalingDuration; + + // Lazily initialized members + private Double queryFractionOfMax = null; + private Double maxQueryGrowthRate = null; + private ClusterNodesTimeseries nodeTimeseries = null; + private ClusterTimeseries clusterTimeseries = null; + + public ClusterModel(Application application, + Cluster cluster, + NodeList clusterNodes, + MetricsDb metricsDb, + Clock clock) { + this.application = application; + this.cluster = cluster; + this.nodes = clusterNodes; + this.metricsDb = metricsDb; + this.clock = clock; + this.scalingDuration = computeScalingDuration(cluster, clusterNodes); + } + + /** For testing */ + ClusterModel(Application application, + Cluster cluster, + Clock clock, + Duration scalingDuration, + ClusterTimeseries clusterTimeseries) { + this.application = application; + this.cluster = cluster; + this.nodes = null; + this.metricsDb = null; + this.clock = clock; + + this.scalingDuration = scalingDuration; + this.clusterTimeseries = clusterTimeseries; + } + + /** Returns the predicted duration of a rescaling of this cluster */ + public Duration scalingDuration() { return scalingDuration; } + + public ClusterNodesTimeseries nodeTimeseries() { + if (nodeTimeseries != null) return nodeTimeseries; + return nodeTimeseries = new ClusterNodesTimeseries(scalingDuration(), cluster, nodes, metricsDb); + } + + public ClusterTimeseries clusterTimeseries() { + if (clusterTimeseries != null) return clusterTimeseries; + return clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id()); + } + + /** + * Returns the predicted max query growth rate per minute as a fraction of the average traffic + * in the scaling window + */ + public double maxQueryGrowthRate() { + if (maxQueryGrowthRate != null) return maxQueryGrowthRate; + return maxQueryGrowthRate = clusterTimeseries().maxQueryGrowthRate(scalingDuration(), clock); + } + + /** Returns the average query rate in the scaling window as a fraction of the max observed query rate */ + public double queryFractionOfMax() { + if (queryFractionOfMax != null) return queryFractionOfMax; + return queryFractionOfMax = clusterTimeseries().queryFractionOfMax(scalingDuration(), clock); + } + + public double averageLoad(Resource resource) { return nodeTimeseries().averageLoad(resource); } + + public double idealLoad(Resource resource) { + switch (resource) { + case cpu : return idealCpuLoad(); + case memory : return idealMemoryLoad; + case disk : return idealDiskLoad; + default : throw new IllegalStateException("No ideal load defined for " + resource); + } + } + + /** Ideal cpu load must take the application traffic fraction into account */ + private double idealCpuLoad() { + double queryCpuFraction = queryCpuFraction(); + + // What's needed to have headroom for growth during scale-up as a fraction of current resources? + double growthRateHeadroom = 1 + maxQueryGrowthRate() * scalingDuration().toMinutes(); + // Cap headroom at 10% above the historical observed peak + if (queryFractionOfMax() != 0) + growthRateHeadroom = Math.min(growthRateHeadroom, 1 / queryFractionOfMax() + 0.1); + + // How much headroom is needed to handle sudden arrival of additional traffic due to another zone going down? + double maxTrafficShiftHeadroom = 10.0; // Cap to avoid extreme sizes from a current very small share + double trafficShiftHeadroom; + if (application.status().maxReadShare() == 0) // No traffic fraction data + trafficShiftHeadroom = 2.0; // assume we currently get half of the global share of traffic + else if (application.status().currentReadShare() == 0) + trafficShiftHeadroom = maxTrafficShiftHeadroom; + else + trafficShiftHeadroom = application.status().maxReadShare() / application.status().currentReadShare(); + trafficShiftHeadroom = Math.min(trafficShiftHeadroom, maxTrafficShiftHeadroom); + + // Assumptions: 1) Write load is not organic so we should not grow to handle more. + // (TODO: But allow applications to set their target write rate and size for that) + // 2) Write load does not change in BCP scenarios. + return queryCpuFraction * 1 / growthRateHeadroom * 1 / trafficShiftHeadroom * idealQueryCpuLoad + + (1 - queryCpuFraction) * idealWriteCpuLoad; + } + + private double queryCpuFraction() { + OptionalDouble queryRate = clusterTimeseries().queryRate(scalingDuration(), clock); + OptionalDouble writeRate = clusterTimeseries().writeRate(scalingDuration(), clock); + if (queryRate.orElse(0) == 0 && writeRate.orElse(0) == 0) return queryCpuFraction(0.5); + return queryCpuFraction(queryRate.orElse(0) / (queryRate.orElse(0) + writeRate.orElse(0))); + } + + private double queryCpuFraction(double queryFraction) { + double relativeQueryCost = 9; // How much more expensive are queries than writes? TODO: Measure + double writeFraction = 1 - queryFraction; + return queryFraction * relativeQueryCost / (queryFraction * relativeQueryCost + writeFraction); + } + + private static Duration computeScalingDuration(Cluster cluster, NodeList nodes) { + int completedEventCount = 0; + Duration totalDuration = Duration.ZERO; + for (ScalingEvent event : cluster.scalingEvents()) { + if (event.duration().isEmpty()) continue; + completedEventCount++; + totalDuration = totalDuration.plus(event.duration().get()); + } + + if (completedEventCount == 0) { // Use defaults + if (nodes.clusterSpec().isStateful()) return Duration.ofHours(12); + return Duration.ofMinutes(10); + } + else { + Duration predictedDuration = totalDuration.dividedBy(completedEventCount); + + // TODO: Remove when we have reliable completion for content clusters + if (nodes.clusterSpec().isStateful() && predictedDuration.minus(Duration.ofHours(12)).isNegative()) + return Duration.ofHours(12); + + if (predictedDuration.minus(Duration.ofMinutes(5)).isNegative()) return Duration.ofMinutes(5); // minimum + return predictedDuration; + } + } + +} diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java index 2d0e77742ec..c097abd8208 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java @@ -40,9 +40,7 @@ public class ClusterNodesTimeseries { } /** Returns the number of nodes measured in this */ - public int nodesMeasured() { - return timeseries.size(); - } + public int nodesMeasured() { return timeseries.size(); } /** Returns the average load of this resource in this */ public double averageLoad(Resource resource) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java index e12e5442c52..79e2d3b3398 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java @@ -22,8 +22,6 @@ public class ClusterTimeseries { private final ClusterSpec.Id cluster; private final List<ClusterMetricSnapshot> snapshots; - private Double cachedMaxQueryGrowthRate = null; - ClusterTimeseries(ClusterSpec.Id cluster, List<ClusterMetricSnapshot> snapshots) { this.cluster = cluster; List<ClusterMetricSnapshot> sortedSnapshots = new ArrayList<>(snapshots); @@ -51,12 +49,6 @@ public class ClusterTimeseries { * The max query growth rate we can predict from this time-series as a fraction of the average traffic in the window */ public double maxQueryGrowthRate(Duration window, Clock clock) { - if (cachedMaxQueryGrowthRate != null) - return cachedMaxQueryGrowthRate; - return cachedMaxQueryGrowthRate = computeMaxQueryGrowthRate(window, clock); - } - - private double computeMaxQueryGrowthRate(Duration window, Clock clock) { if (snapshots.isEmpty()) return 0.1; // Find the period having the highest growth rate, where total growth exceeds 30% increase double maxGrowthRate = 0; // In query rate per minute diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java index c8de94eeab8..b3cf6c1e962 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java @@ -103,7 +103,7 @@ public class MetricsResponse { private boolean clusterIsStable(Node node, NodeList applicationNodes, NodeRepository nodeRepository) { ClusterSpec cluster = node.allocation().get().membership().cluster(); - return Autoscaler.stable(applicationNodes.cluster(cluster.id()), nodeRepository); + return Autoscaler.clusterIsStable(applicationNodes.cluster(cluster.id()), nodeRepository); } public static MetricsResponse empty() { return new MetricsResponse(List.of()); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Resource.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Resource.java index b841b31833f..c639ad1f779 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Resource.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Resource.java @@ -12,25 +12,19 @@ public enum Resource { /** Cpu utilization ratio */ cpu { - public double idealAverageLoad() { return 0.8; } double valueFrom(NodeResources resources) { return resources.vcpu(); } }, /** Memory utilization ratio */ memory { - public double idealAverageLoad() { return 0.7; } double valueFrom(NodeResources resources) { return resources.memoryGb(); } }, /** Disk utilization ratio */ disk { - public double idealAverageLoad() { return 0.6; } double valueFrom(NodeResources resources) { return resources.diskGb(); } }; - /** The load we should have of this resource on average, when one node in the cluster is down */ - public abstract double idealAverageLoad(); - abstract double valueFrom(NodeResources resources); } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java index 9f6a4fc77cd..b1a1e86b08d 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java @@ -51,18 +51,14 @@ public class ResourceTarget { } /** Create a target of achieving ideal load given a current load */ - public static ResourceTarget idealLoad(Duration scalingDuration, - ClusterTimeseries clusterTimeseries, - ClusterNodesTimeseries clusterNodesTimeseries, - AllocatableClusterResources current, - Application application, - Clock clock) { - return new ResourceTarget(nodeUsage(Resource.cpu, clusterNodesTimeseries.averageLoad(Resource.cpu), current) - / idealCpuLoad(scalingDuration, clusterTimeseries, application, clock), - nodeUsage(Resource.memory, clusterNodesTimeseries.averageLoad(Resource.memory), current) - / Resource.memory.idealAverageLoad(), - nodeUsage(Resource.disk, clusterNodesTimeseries.averageLoad(Resource.disk), current) - / Resource.disk.idealAverageLoad(), + public static ResourceTarget idealLoad(ClusterModel clusterModel, + AllocatableClusterResources current) { + return new ResourceTarget(nodeUsage(Resource.cpu, clusterModel.averageLoad(Resource.cpu), current) + / clusterModel.idealLoad(Resource.cpu), + nodeUsage(Resource.memory, clusterModel.averageLoad(Resource.memory), current) + / clusterModel.idealLoad(Resource.memory), + nodeUsage(Resource.disk, clusterModel.averageLoad(Resource.disk), current) + / clusterModel.idealLoad(Resource.disk), true); } @@ -74,58 +70,4 @@ public class ResourceTarget { false); } - /** Ideal cpu load must take the application traffic fraction into account */ - public static double idealCpuLoad(Duration scalingDuration, - ClusterTimeseries clusterTimeseries, - Application application, - Clock clock) { - double queryCpuFraction = queryCpuFraction(clusterTimeseries, scalingDuration, clock); - - // What's needed to have headroom for growth during scale-up as a fraction of current resources? - double maxGrowthRate = clusterTimeseries.maxQueryGrowthRate(scalingDuration, clock); // in fraction per minute of the current traffic - double growthRateHeadroom = 1 + maxGrowthRate * scalingDuration.toMinutes(); - // Cap headroom at 10% above the historical observed peak - double fractionOfMax = clusterTimeseries.queryFractionOfMax(scalingDuration, clock); - if (fractionOfMax != 0) - growthRateHeadroom = Math.min(growthRateHeadroom, 1 / fractionOfMax + 0.1); - - // How much headroom is needed to handle sudden arrival of additional traffic due to another zone going down? - double maxTrafficShiftHeadroom = 10.0; // Cap to avoid extreme sizes from a current very small share - double trafficShiftHeadroom; - if (application.status().maxReadShare() == 0) // No traffic fraction data - trafficShiftHeadroom = 2.0; // assume we currently get half of the global share of traffic - else if (application.status().currentReadShare() == 0) - trafficShiftHeadroom = maxTrafficShiftHeadroom; - else - trafficShiftHeadroom = application.status().maxReadShare() / application.status().currentReadShare(); - trafficShiftHeadroom = Math.min(trafficShiftHeadroom, maxTrafficShiftHeadroom); - - // Assumptions: 1) Write load is not organic so we should not grow to handle more. - // (TODO: But allow applications to set their target write rate and size for that) - // 2) Write load does not change in BCP scenarios. - return queryCpuFraction * 1 / growthRateHeadroom * 1 / trafficShiftHeadroom * idealQueryCpuLoad() + - (1 - queryCpuFraction) * idealWriteCpuLoad(); - } - - private static double queryCpuFraction(ClusterTimeseries clusterTimeseries, Duration scalingDuration, Clock clock) { - OptionalDouble queryRate = clusterTimeseries.queryRate(scalingDuration, clock); - OptionalDouble writeRate = clusterTimeseries.writeRate(scalingDuration, clock); - if (queryRate.orElse(0) == 0 && writeRate.orElse(0) == 0) return queryCpuFraction(0.5); - return queryCpuFraction(queryRate.orElse(0) / (queryRate.orElse(0) + writeRate.orElse(0))); - } - - private static double queryCpuFraction(double queryFraction) { - double relativeQueryCost = 9; // How much more expensive are queries than writes? TODO: Measure - double writeFraction = 1 - queryFraction; - return queryFraction * relativeQueryCost / (queryFraction * relativeQueryCost + writeFraction); - } - - public static double idealQueryCpuLoad() { return Resource.cpu.idealAverageLoad(); } - - public static double idealWriteCpuLoad() { return 0.95; } - - public static double idealMemoryLoad() { return Resource.memory.idealAverageLoad(); } - - public static double idealDiskLoad() { return Resource.disk.idealAverageLoad(); } - } diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java index 8d8d7e01049..cc59860384b 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java @@ -7,9 +7,11 @@ import com.yahoo.slime.Cursor; import com.yahoo.slime.Slime; import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; +import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Application; import com.yahoo.vespa.hosted.provision.applications.Cluster; import com.yahoo.vespa.hosted.provision.applications.ScalingEvent; +import com.yahoo.vespa.hosted.provision.autoscale.ClusterModel; import com.yahoo.vespa.hosted.provision.autoscale.ClusterNodesTimeseries; import com.yahoo.vespa.hosted.provision.autoscale.ClusterTimeseries; import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb; @@ -29,40 +31,45 @@ import java.util.List; */ public class ApplicationSerializer { - public static Slime toSlime(Application application, NodeList applicationNodes, MetricsDb metricsDb, URI applicationUri) { + public static Slime toSlime(Application application, + NodeList applicationNodes, + MetricsDb metricsDb, + NodeRepository nodeRepository, + URI applicationUri) { Slime slime = new Slime(); - toSlime(application, applicationNodes, metricsDb, slime.setObject(), applicationUri); + toSlime(application, applicationNodes, metricsDb, nodeRepository, slime.setObject(), applicationUri); return slime; } private static void toSlime(Application application, NodeList applicationNodes, MetricsDb metricsDb, + NodeRepository nodeRepository, Cursor object, URI applicationUri) { object.setString("url", applicationUri.toString()); object.setString("id", application.id().toFullString()); - clustersToSlime(application, applicationNodes, metricsDb, object.setObject("clusters")); + clustersToSlime(application, applicationNodes, metricsDb, nodeRepository, object.setObject("clusters")); } private static void clustersToSlime(Application application, NodeList applicationNodes, MetricsDb metricsDb, + NodeRepository nodeRepository, Cursor clustersObject) { - application.clusters().values().forEach(cluster -> toSlime(application, cluster, applicationNodes, metricsDb, clustersObject)); + application.clusters().values().forEach(cluster -> toSlime(application, cluster, applicationNodes, metricsDb, nodeRepository, clustersObject)); } private static void toSlime(Application application, Cluster cluster, NodeList applicationNodes, MetricsDb metricsDb, + NodeRepository nodeRepository, Cursor clustersObject) { NodeList nodes = applicationNodes.not().retired().cluster(cluster.id()); if (nodes.isEmpty()) return; ClusterResources currentResources = nodes.toResources(); - Duration scalingDuration = cluster.scalingDuration(nodes.clusterSpec()); - var clusterNodesTimeseries = new ClusterNodesTimeseries(Duration.ofHours(1), cluster, nodes, metricsDb); - var clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id()); + ClusterModel clusterModel = new ClusterModel(application, cluster, nodes, metricsDb, nodeRepository.clock()); Cursor clusterObject = clustersObject.setObject(cluster.id().value()); clusterObject.setString("type", nodes.clusterSpec().type().name()); @@ -72,12 +79,12 @@ public class ApplicationSerializer { if (cluster.shouldSuggestResources(currentResources)) cluster.suggestedResources().ifPresent(suggested -> toSlime(suggested.resources(), clusterObject.setObject("suggested"))); cluster.targetResources().ifPresent(target -> toSlime(target, clusterObject.setObject("target"))); - clusterUtilizationToSlime(application, scalingDuration, clusterTimeseries, clusterNodesTimeseries, metricsDb.clock(), clusterObject.setObject("utilization")); + clusterUtilizationToSlime(clusterModel, clusterObject.setObject("utilization")); scalingEventsToSlime(cluster.scalingEvents(), clusterObject.setArray("scalingEvents")); clusterObject.setString("autoscalingStatus", cluster.autoscalingStatus()); - clusterObject.setLong("scalingDuration", scalingDuration.toMillis()); - clusterObject.setDouble("maxQueryGrowthRate", clusterTimeseries.maxQueryGrowthRate(scalingDuration, metricsDb.clock())); - clusterObject.setDouble("currentQueryFractionOfMax", clusterTimeseries.queryFractionOfMax(scalingDuration, metricsDb.clock())); + clusterObject.setLong("scalingDuration", clusterModel.scalingDuration().toMillis()); + clusterObject.setDouble("maxQueryGrowthRate", clusterModel.maxQueryGrowthRate()); + clusterObject.setDouble("currentQueryFractionOfMax", clusterModel.queryFractionOfMax()); } private static void toSlime(ClusterResources resources, Cursor clusterResourcesObject) { @@ -86,18 +93,13 @@ public class ApplicationSerializer { NodeResourcesSerializer.toSlime(resources.nodeResources(), clusterResourcesObject.setObject("resources")); } - private static void clusterUtilizationToSlime(Application application, - Duration scalingDuration, - ClusterTimeseries clusterTimeseries, - ClusterNodesTimeseries clusterNodesTimeseries, - Clock clock, - Cursor utilizationObject) { - utilizationObject.setDouble("cpu", clusterNodesTimeseries.averageLoad(Resource.cpu)); - utilizationObject.setDouble("idealCpu", ResourceTarget.idealCpuLoad(scalingDuration, clusterTimeseries, application, clock)); - utilizationObject.setDouble("memory", clusterNodesTimeseries.averageLoad(Resource.memory)); - utilizationObject.setDouble("idealMemory", ResourceTarget.idealMemoryLoad()); - utilizationObject.setDouble("disk", clusterNodesTimeseries.averageLoad(Resource.disk)); - utilizationObject.setDouble("idealDisk", ResourceTarget.idealDiskLoad()); + private static void clusterUtilizationToSlime(ClusterModel clusterModel, Cursor utilizationObject) { + utilizationObject.setDouble("cpu", clusterModel.averageLoad(Resource.cpu)); + utilizationObject.setDouble("idealCpu", clusterModel.idealLoad(Resource.cpu)); + utilizationObject.setDouble("memory", clusterModel.averageLoad(Resource.memory)); + utilizationObject.setDouble("idealMemory", clusterModel.idealLoad(Resource.memory)); + utilizationObject.setDouble("disk", clusterModel.averageLoad(Resource.disk)); + utilizationObject.setDouble("idealDisk", clusterModel.idealLoad(Resource.disk)); } private static void scalingEventsToSlime(List<ScalingEvent> scalingEvents, Cursor scalingEventsArray) { diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java index 2442ff9d565..52081877d98 100644 --- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java +++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java @@ -447,6 +447,7 @@ public class NodesV2ApiHandler extends LoggingRequestHandler { Slime slime = ApplicationSerializer.toSlime(application.get(), nodeRepository.nodes().list(Node.State.active).owner(id), metricsDb, + nodeRepository, withPath("/nodes/v2/applications/" + id, uri)); return new SlimeJsonResponse(slime); } diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java index cc3eeb47073..89da20c5550 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java @@ -141,8 +141,8 @@ class AutoscalingTester { clock().advance(Duration.ofMinutes(5)); for (Node node : nodes) { float cpu = value * oneExtraNodeFactor; - float memory = (float) Resource.memory.idealAverageLoad() * otherResourcesLoad * oneExtraNodeFactor; - float disk = (float) Resource.disk.idealAverageLoad() * otherResourcesLoad * oneExtraNodeFactor; + float memory = (float) ClusterModel.idealMemoryLoad * otherResourcesLoad * oneExtraNodeFactor; + float disk = (float) ClusterModel.idealDiskLoad * otherResourcesLoad * oneExtraNodeFactor; db.addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(clock().instant(), cpu, memory, @@ -174,7 +174,7 @@ class AutoscalingTester { for (Node node : nodes) { float cpu = (float) 0.2 * otherResourcesLoad * oneExtraNodeFactor; float memory = value * oneExtraNodeFactor; - float disk = (float) Resource.disk.idealAverageLoad() * otherResourcesLoad * oneExtraNodeFactor; + float disk = (float) ClusterModel.idealDiskLoad * otherResourcesLoad * oneExtraNodeFactor; db.addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(clock().instant(), cpu, memory, diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java index 6f60de62f1f..70550b0a7c3 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java @@ -22,7 +22,7 @@ import static org.junit.Assert.assertEquals; /** * @author bratseth */ -public class ResourceTargetTest { +public class ClusterModelTest { private static final double delta = 0.001; @@ -34,22 +34,16 @@ public class ResourceTargetTest { application = application.with(cluster); // No current traffic share: Ideal load is low but capped - application = application.with(new Status(0.0, 1.0)); - assertEquals(0.131, - ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), - timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock), - application, - clock), - delta); + var model1 = new ClusterModel(application.with(new Status(0.0, 1.0)), + cluster, clock, Duration.ofMinutes(10), + timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock)); + assertEquals(0.131, model1.idealLoad(Resource.cpu), delta); // Almost no current traffic share: Ideal load is low but capped - application = application.with(new Status(0.0001, 1.0)); - assertEquals(0.131, - ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), - timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock), - application, - clock), - delta); + var model2 = new ClusterModel(application.with(new Status(0.0001, 1.0)), + cluster, clock, Duration.ofMinutes(10), + timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock)); + assertEquals(0.131, model2.idealLoad(Resource.cpu), delta); } @Test @@ -61,21 +55,16 @@ public class ResourceTargetTest { application = application.with(cluster); // No current traffic: Ideal load is low but capped - assertEquals(0.275, - ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), - timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock), - application, - clock), - delta); + var model1 = new ClusterModel(application, + cluster, clock, Duration.ofMinutes(10), + timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock)); + assertEquals(0.275, model1.idealLoad(Resource.cpu), delta); // Almost current traffic: Ideal load is low but capped - application = application.with(new Status(0.0001, 1.0)); - assertEquals(0.04, - ResourceTarget.idealCpuLoad(Duration.ofMinutes(10), - timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0, clock), - application, - clock), - delta); + var model2 = new ClusterModel(application.with(new Status(0.0001, 1.0)), + cluster, clock, Duration.ofMinutes(10), + timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0, clock)); + assertEquals(0.275, model1.idealLoad(Resource.cpu), delta); } private Cluster cluster(NodeResources resources) { diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java index 9ae67cef235..10851252c98 100644 --- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java +++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java @@ -17,6 +17,7 @@ import com.yahoo.vespa.hosted.provision.Node; import com.yahoo.vespa.hosted.provision.NodeList; import com.yahoo.vespa.hosted.provision.NodeRepository; import com.yahoo.vespa.hosted.provision.applications.Cluster; +import com.yahoo.vespa.hosted.provision.autoscale.ClusterModel; import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricSnapshot; import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb; import com.yahoo.vespa.hosted.provision.autoscale.Resource; @@ -99,9 +100,7 @@ public class ScalingSuggestionsMaintainerTest { var suggested = tester.nodeRepository().applications().get(app1).get().cluster(cluster1.id()).get().suggestedResources().get().resources(); tester.deploy(app1, cluster1, Capacity.from(suggested, suggested, false, true)); tester.clock().advance(Duration.ofDays(2)); - addMeasurements(0.2f, - (float)Resource.memory.idealAverageLoad(), - (float)Resource.disk.idealAverageLoad(), + addMeasurements(0.2f, 0.7f, 0.6f, 0, 500, app1, tester.nodeRepository(), metricsDb); maintainer.maintain(); assertEquals("Suggestion is to keep the current allocation", |