Merge pull request #17030 from vespa-engine/bratseth/cluster-model

Bratseth/cluster model
author: Valerij Fredriksen <freva@users.noreply.github.com> 2021-03-18 13:27:14 +0100
committer: GitHub <noreply@github.com> 2021-03-18 13:27:14 +0100
commit: 7c5948aceceec2a3c279755622876932fdf5a3ed (patch)
tree: 1801a9db90991b40f6f2ff27f609d02f049e3ce0 /node-repository
parent: 2e0ad58bb9bbb5cc24289acff25046a5b9442e2b (diff)
parent: 95a73e1587180eee3a272aed8a02e865943666cc (diff)
13 files changed, 259 insertions, 205 deletions
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
index 69d7cec4007..59b70ff1ef0 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/applications/Cluster.java
@@ -129,32 +129,6 @@ public class Cluster {
         return new Cluster(id, exclusive, min, max, suggested, target, scalingEvents, autoscalingStatus);
     }
 
-    /** The predicted duration of a rescaling of this cluster */
-    public Duration scalingDuration(ClusterSpec clusterSpec) {
-        int completedEventCount = 0;
-        Duration totalDuration = Duration.ZERO;
-        for (ScalingEvent event : scalingEvents()) {
-            if (event.duration().isEmpty()) continue;
-            completedEventCount++;
-            totalDuration = totalDuration.plus(event.duration().get());
-        }
-
-        if (completedEventCount == 0) { // Use defaults
-            if (clusterSpec.isStateful()) return Duration.ofHours(12);
-            return Duration.ofMinutes(10);
-        }
-        else {
-            Duration predictedDuration = totalDuration.dividedBy(completedEventCount);
-
-            // TODO: Remove when we have reliable completion for content clusters
-            if (clusterSpec.isStateful() && predictedDuration.minus(Duration.ofHours(12)).isNegative())
-                return Duration.ofHours(12);
-
-            if (predictedDuration.minus(Duration.ofMinutes(5)).isNegative()) return Duration.ofMinutes(5); // minimum
-            return predictedDuration;
-        }
-    }
-
     @Override
     public int hashCode() { return id.hashCode(); }
 
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
index ac3430fecf9..9791aabf7b4 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
@@ -59,35 +59,25 @@ public class Autoscaler {
     }
 
     private Advice autoscale(Application application, Cluster cluster, NodeList clusterNodes, Limits limits) {
-        if ( ! stable(clusterNodes, nodeRepository))
-            return Advice.none("Cluster change in progress");
+        ClusterModel clusterModel = new ClusterModel(application, cluster, clusterNodes, metricsDb, nodeRepository.clock());
 
-        Duration scalingWindow = cluster.scalingDuration(clusterNodes.clusterSpec());
-        if (scaledIn(scalingWindow, cluster))
-            return Advice.dontScale("Won't autoscale now: Less than " + scalingWindow + " since last resource change");
+        if ( ! clusterIsStable(clusterNodes, nodeRepository))
+            return Advice.none("Cluster change in progress");
 
-        var clusterNodesTimeseries = new ClusterNodesTimeseries(scalingWindow, cluster, clusterNodes, metricsDb);
-        var currentAllocation = new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive());
+        if (scaledIn(clusterModel.scalingDuration(), cluster))
+            return Advice.dontScale("Won't autoscale now: Less than " + clusterModel.scalingDuration() + " since last resource change");
 
-        int measurementsPerNode = clusterNodesTimeseries.measurementsPerNode();
-        if  (measurementsPerNode < minimumMeasurementsPerNode(scalingWindow))
+        if  (clusterModel.nodeTimeseries().measurementsPerNode() < minimumMeasurementsPerNode(clusterModel.scalingDuration()))
             return Advice.none("Collecting more data before making new scaling decisions: Need to measure for " +
-                               scalingWindow + " since the last resource change completed");
+                               clusterModel.scalingDuration() + " since the last resource change completed");
 
-        int nodesMeasured = clusterNodesTimeseries.nodesMeasured();
-        if (nodesMeasured != clusterNodes.size())
+        if (clusterModel.nodeTimeseries().nodesMeasured() != clusterNodes.size())
             return Advice.none("Collecting more data before making new scaling decisions: " +
-                               "Have measurements from " + nodesMeasured + " nodes, but require from " + clusterNodes.size());
+                               "Have measurements from " + clusterModel.nodeTimeseries().nodesMeasured() +
+                               " nodes, but require from " + clusterNodes.size());
 
-
-        var scalingDuration = cluster.scalingDuration(clusterNodes.clusterSpec());
-        var clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id());
-        var target = ResourceTarget.idealLoad(scalingDuration,
-                                              clusterTimeseries,
-                                              clusterNodesTimeseries,
-                                              currentAllocation,
-                                              application,
-                                              nodeRepository.clock());
+        var currentAllocation = new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive());
+        var target = ResourceTarget.idealLoad(clusterModel, currentAllocation);
 
         Optional<AllocatableClusterResources> bestAllocation =
                 allocationOptimizer.findBestAllocation(target, currentAllocation, limits);
@@ -97,13 +87,27 @@ public class Autoscaler {
         if (similar(bestAllocation.get().realResources(), currentAllocation.realResources()))
             return Advice.dontScale("Cluster is ideally scaled within configured limits");
 
-        if (isDownscaling(bestAllocation.get(), currentAllocation) && scaledIn(scalingWindow.multipliedBy(3), cluster))
-            return Advice.dontScale("Waiting " + scalingWindow.multipliedBy(3) +
+        if (isDownscaling(bestAllocation.get(), currentAllocation) && scaledIn(clusterModel.scalingDuration().multipliedBy(3), cluster))
+            return Advice.dontScale("Waiting " + clusterModel.scalingDuration().multipliedBy(3) +
                                     " since the last change before reducing resources");
 
         return Advice.scaleTo(bestAllocation.get().advertisedResources());
     }
 
+    public static boolean clusterIsStable(NodeList clusterNodes, NodeRepository nodeRepository) {
+        // Unstable if any node is marked wantToRetire, is retired in its membership, or has a removable allocation (allocation().get() is unchecked)
+        if (clusterNodes.stream().anyMatch(node -> node.status().wantToRetire() ||
+                                                   node.allocation().get().membership().retired() ||
+                                                   node.allocation().get().isRemovable()))
+            return false;
+
+        // Unstable if the owner of the first cluster node has any reserved nodes, which is taken to mean a deployment is ongoing (first().get() is unchecked)
+        if (nodeRepository.nodes().list(Node.State.reserved).owner(clusterNodes.first().get().allocation().get().owner()).size() > 0)
+            return false;
+
+        return true;
+    }
+
     /** Returns true if both total real resources and total cost are similar */
     public static boolean similar(ClusterResources a, ClusterResources b) {
         return similar(a.cost(), b.cost(), costDifferenceWorthReallocation) &&
@@ -143,20 +147,6 @@ public class Autoscaler {
         return (int)minimumMeasurements;
     }
 
-    public static boolean stable(NodeList nodes, NodeRepository nodeRepository) {
-        // The cluster is processing recent changes
-        if (nodes.stream().anyMatch(node -> node.status().wantToRetire() ||
-                                            node.allocation().get().membership().retired() ||
-                                            node.allocation().get().isRemovable()))
-            return false;
-
-        // A deployment is ongoing
-        if (nodeRepository.nodes().list(Node.State.reserved).owner(nodes.first().get().allocation().get().owner()).size() > 0)
-            return false;
-
-        return true;
-    }
-
     public static class Advice {
 
         private final boolean present;
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
new file mode 100644
index 00000000000..4fb91e8592e
--- /dev/null
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModel.java
@@ -0,0 +1,173 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.hosted.provision.autoscale;
+
+import com.yahoo.vespa.hosted.provision.NodeList;
+import com.yahoo.vespa.hosted.provision.applications.Application;
+import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
+
+import java.time.Clock;
+import java.time.Duration;
+import java.util.OptionalDouble;
+
+/**
+ * A cluster with its associated metrics which allows prediction about its future behavior.
+ * For single-threaded, short-term usage.
+ *
+ * @author bratseth
+ */
+public class ClusterModel {
+
+    static final double idealQueryCpuLoad = 0.8; // Target load applied to the query share of cpu in idealCpuLoad()
+    static final double idealWriteCpuLoad = 0.95; // Target load applied to the write share of cpu in idealCpuLoad()
+    static final double idealMemoryLoad = 0.7; // Returned as-is by idealLoad(Resource.memory)
+    static final double idealDiskLoad = 0.6; // Returned as-is by idealLoad(Resource.disk)
+
+    private final Application application;
+    private final Cluster cluster;
+    private final NodeList nodes;
+    private final MetricsDb metricsDb;
+    private final Clock clock;
+    private final Duration scalingDuration; // Fixed at construction: computed from the cluster, or supplied by the test constructor
+
+    // Lazily initialized members: null until first requested through their accessor, then cached in this instance
+    private Double queryFractionOfMax = null;
+    private Double maxQueryGrowthRate = null;
+    private ClusterNodesTimeseries nodeTimeseries = null;
+    private ClusterTimeseries clusterTimeseries = null;
+
+    public ClusterModel(Application application,
+                        Cluster cluster,
+                        NodeList clusterNodes,
+                        MetricsDb metricsDb,
+                        Clock clock) {
+        this.application = application;
+        this.cluster = cluster;
+        this.nodes = clusterNodes;
+        this.metricsDb = metricsDb;
+        this.clock = clock;
+        this.scalingDuration = computeScalingDuration(cluster, clusterNodes); // Derived from the cluster's scaling events and cluster spec
+    }
+
+    /** For testing. Leaves nodes and metricsDb null: clusterTimeseries() returns the given instance without touching metricsDb, while nodeTimeseries() and averageLoad() would pass the nulls on (NOTE(review): presumably unusable on such instances) */
+    ClusterModel(Application application,
+                 Cluster cluster,
+                 Clock clock,
+                 Duration scalingDuration,
+                 ClusterTimeseries clusterTimeseries) {
+        this.application = application;
+        this.cluster = cluster;
+        this.nodes = null;
+        this.metricsDb = null;
+        this.clock = clock;
+
+        this.scalingDuration = scalingDuration;
+        this.clusterTimeseries = clusterTimeseries;
+    }
+
+    /** Returns the predicted duration of a rescaling of this cluster */
+    public Duration scalingDuration() { return scalingDuration; }
+
+    public ClusterNodesTimeseries nodeTimeseries() { // Created on first call from scalingDuration(), cluster, nodes and metricsDb, then cached
+        if (nodeTimeseries != null) return nodeTimeseries;
+        return nodeTimeseries = new ClusterNodesTimeseries(scalingDuration(), cluster, nodes, metricsDb);
+    }
+
+    public ClusterTimeseries clusterTimeseries() { // Fetched from metricsDb on first call (unless supplied at construction), then cached
+        if (clusterTimeseries != null) return clusterTimeseries;
+        return clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id());
+    }
+
+    /**
+     * Returns the predicted max query growth rate per minute as a fraction of the average traffic
+     * in the scaling window
+     */
+    public double maxQueryGrowthRate() {
+        if (maxQueryGrowthRate != null) return maxQueryGrowthRate;
+        return maxQueryGrowthRate = clusterTimeseries().maxQueryGrowthRate(scalingDuration(), clock);
+    }
+
+    /** Returns the average query rate in the scaling window as a fraction of the max observed query rate */
+    public double queryFractionOfMax() {
+        if (queryFractionOfMax != null) return queryFractionOfMax;
+        return queryFractionOfMax = clusterTimeseries().queryFractionOfMax(scalingDuration(), clock);
+    }
+
+    public double averageLoad(Resource resource) { return nodeTimeseries().averageLoad(resource); }
+
+    public double idealLoad(Resource resource) { // cpu is computed from traffic data; memory and disk are the constants of this class
+        switch (resource) {
+            case cpu : return idealCpuLoad();
+            case memory : return idealMemoryLoad;
+            case disk : return idealDiskLoad;
+            default : throw new IllegalStateException("No ideal load defined for " + resource);
+        }
+    }
+
+    /** Returns the ideal cpu load: the query share of cpu is scaled down by growth and traffic shift headroom, the write share is not */
+    private double idealCpuLoad() {
+        double queryCpuFraction = queryCpuFraction();
+
+        // What's needed to have headroom for growth during scale-up as a fraction of current resources?
+        double growthRateHeadroom = 1 + maxQueryGrowthRate() * scalingDuration().toMinutes();
+        // Cap headroom at what is needed to reach the max observed query rate, plus 0.1 (skipped when the fraction is 0, avoiding division by zero)
+        if (queryFractionOfMax() != 0)
+            growthRateHeadroom = Math.min(growthRateHeadroom, 1 / queryFractionOfMax() + 0.1);
+
+        // How much headroom is needed to handle sudden arrival of additional traffic due to another zone going down?
+        double maxTrafficShiftHeadroom = 10.0; // Cap to avoid extreme sizes from a current very small share
+        double trafficShiftHeadroom;
+        if (application.status().maxReadShare() == 0) // No traffic fraction data
+            trafficShiftHeadroom = 2.0; // assume we currently get half of the global share of traffic
+        else if (application.status().currentReadShare() == 0)
+            trafficShiftHeadroom = maxTrafficShiftHeadroom;
+        else
+            trafficShiftHeadroom = application.status().maxReadShare() / application.status().currentReadShare();
+        trafficShiftHeadroom = Math.min(trafficShiftHeadroom, maxTrafficShiftHeadroom); // Only the share ratio above can exceed the cap
+
+        // Assumptions: 1) Write load is not organic so we should not grow to handle more.
+        //                 (TODO: But allow applications to set their target write rate and size for that)
+        //              2) Write load does not change in BCP scenarios.
+        return queryCpuFraction * 1 / growthRateHeadroom * 1 / trafficShiftHeadroom * idealQueryCpuLoad +
+               (1 - queryCpuFraction) * idealWriteCpuLoad;
+    }
+
+    private double queryCpuFraction() { // The fraction of cpu spent on queries, estimated from query and write rates in the scaling window
+        OptionalDouble queryRate = clusterTimeseries().queryRate(scalingDuration(), clock);
+        OptionalDouble writeRate = clusterTimeseries().writeRate(scalingDuration(), clock);
+        if (queryRate.orElse(0) == 0 && writeRate.orElse(0) == 0) return queryCpuFraction(0.5); // Both rates missing or zero: assume an even split
+        return queryCpuFraction(queryRate.orElse(0) / (queryRate.orElse(0) + writeRate.orElse(0)));
+    }
+
+    private double queryCpuFraction(double queryFraction) { // Converts a fraction of operations to a fraction of cpu by weighting queries with relativeQueryCost
+        double relativeQueryCost = 9; // How much more expensive are queries than writes? TODO: Measure
+        double writeFraction = 1 - queryFraction;
+        return queryFraction * relativeQueryCost / (queryFraction * relativeQueryCost + writeFraction);
+    }
+
+    private static Duration computeScalingDuration(Cluster cluster, NodeList nodes) { // Average duration of completed scaling events, subject to defaults and minimums
+        int completedEventCount = 0;
+        Duration totalDuration = Duration.ZERO;
+        for (ScalingEvent event : cluster.scalingEvents()) {
+            if (event.duration().isEmpty()) continue; // Events without a duration are not counted as completed
+            completedEventCount++;
+            totalDuration = totalDuration.plus(event.duration().get());
+        }
+
+        if (completedEventCount == 0) { // Use defaults
+            if (nodes.clusterSpec().isStateful()) return Duration.ofHours(12);
+            return Duration.ofMinutes(10);
+        }
+        else {
+            Duration predictedDuration = totalDuration.dividedBy(completedEventCount);
+
+            // TODO: Remove when we have reliable completion for content clusters
+            if (nodes.clusterSpec().isStateful() && predictedDuration.minus(Duration.ofHours(12)).isNegative())
+                return Duration.ofHours(12);
+
+            if (predictedDuration.minus(Duration.ofMinutes(5)).isNegative()) return Duration.ofMinutes(5); // minimum
+            return predictedDuration;
+        }
+    }
+
+}
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
index 2d0e77742ec..c097abd8208 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterNodesTimeseries.java
@@ -40,9 +40,7 @@ public class ClusterNodesTimeseries {
     }
 
     /** Returns the number of nodes measured in this */
-    public int nodesMeasured() {
-        return timeseries.size();
-    }
+    public int nodesMeasured() { return timeseries.size(); }
 
     /** Returns the average load of this resource in this */
     public double averageLoad(Resource resource) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
index e12e5442c52..79e2d3b3398 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterTimeseries.java
@@ -22,8 +22,6 @@ public class ClusterTimeseries {
     private final ClusterSpec.Id cluster;
     private final List<ClusterMetricSnapshot> snapshots;
 
-    private Double cachedMaxQueryGrowthRate = null;
-
     ClusterTimeseries(ClusterSpec.Id cluster, List<ClusterMetricSnapshot> snapshots) {
         this.cluster = cluster;
         List<ClusterMetricSnapshot> sortedSnapshots = new ArrayList<>(snapshots);
@@ -51,12 +49,6 @@ public class ClusterTimeseries {
      * The max query growth rate we can predict from this time-series as a fraction of the average traffic in the window
      */
     public double maxQueryGrowthRate(Duration window, Clock clock) {
-        if (cachedMaxQueryGrowthRate != null)
-            return cachedMaxQueryGrowthRate;
-        return cachedMaxQueryGrowthRate = computeMaxQueryGrowthRate(window, clock);
-    }
-
-    private double computeMaxQueryGrowthRate(Duration window, Clock clock) {
         if (snapshots.isEmpty()) return 0.1;
         // Find the period having the highest growth rate, where total growth exceeds 30% increase
         double maxGrowthRate = 0; // In query rate per minute
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
index c8de94eeab8..b3cf6c1e962 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/MetricsResponse.java
@@ -103,7 +103,7 @@ public class MetricsResponse {
 
     private boolean clusterIsStable(Node node, NodeList applicationNodes, NodeRepository nodeRepository) {
         ClusterSpec cluster = node.allocation().get().membership().cluster();
-        return Autoscaler.stable(applicationNodes.cluster(cluster.id()), nodeRepository);
+        return Autoscaler.clusterIsStable(applicationNodes.cluster(cluster.id()), nodeRepository);
     }
 
     public static MetricsResponse empty() { return new MetricsResponse(List.of()); }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Resource.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Resource.java
index b841b31833f..c639ad1f779 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Resource.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Resource.java
@@ -12,25 +12,19 @@ public enum Resource {
 
     /** Cpu utilization ratio */
     cpu {
-        public double idealAverageLoad() { return 0.8; }
         double valueFrom(NodeResources resources) { return resources.vcpu(); }
     },
 
     /** Memory utilization ratio */
     memory {
-        public double idealAverageLoad() { return 0.7; }
         double valueFrom(NodeResources resources) { return resources.memoryGb(); }
     },
 
     /** Disk utilization ratio */
     disk {
-        public double idealAverageLoad() { return 0.6; }
         double valueFrom(NodeResources resources) { return resources.diskGb(); }
     };
 
-    /** The load we should have of this resource on average, when one node in the cluster is down */
-    public abstract double idealAverageLoad();
-
     abstract double valueFrom(NodeResources resources);
 
 }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java
index 9f6a4fc77cd..b1a1e86b08d 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTarget.java
@@ -51,18 +51,14 @@ public class ResourceTarget {
     }
 
     /** Create a target of achieving ideal load given a current load */
-    public static ResourceTarget idealLoad(Duration scalingDuration,
-                                           ClusterTimeseries clusterTimeseries,
-                                           ClusterNodesTimeseries clusterNodesTimeseries,
-                                           AllocatableClusterResources current,
-                                           Application application,
-                                           Clock clock) {
-        return new ResourceTarget(nodeUsage(Resource.cpu, clusterNodesTimeseries.averageLoad(Resource.cpu), current)
-                                  / idealCpuLoad(scalingDuration, clusterTimeseries, application, clock),
-                                  nodeUsage(Resource.memory, clusterNodesTimeseries.averageLoad(Resource.memory), current)
-                                  / Resource.memory.idealAverageLoad(),
-                                  nodeUsage(Resource.disk, clusterNodesTimeseries.averageLoad(Resource.disk), current)
-                                  / Resource.disk.idealAverageLoad(),
+    public static ResourceTarget idealLoad(ClusterModel clusterModel,
+                                           AllocatableClusterResources current) {
+        return new ResourceTarget(nodeUsage(Resource.cpu, clusterModel.averageLoad(Resource.cpu), current)
+                                  / clusterModel.idealLoad(Resource.cpu),
+                                  nodeUsage(Resource.memory, clusterModel.averageLoad(Resource.memory), current)
+                                  / clusterModel.idealLoad(Resource.memory),
+                                  nodeUsage(Resource.disk, clusterModel.averageLoad(Resource.disk), current)
+                                  / clusterModel.idealLoad(Resource.disk),
                                   true);
     }
 
@@ -74,58 +70,4 @@ public class ResourceTarget {
                                   false);
     }
 
-    /** Ideal cpu load must take the application traffic fraction into account */
-    public static double idealCpuLoad(Duration scalingDuration,
-                                      ClusterTimeseries clusterTimeseries,
-                                      Application application,
-                                      Clock clock) {
-        double queryCpuFraction = queryCpuFraction(clusterTimeseries, scalingDuration, clock);
-
-        // What's needed to have headroom for growth during scale-up as a fraction of current resources?
-        double maxGrowthRate = clusterTimeseries.maxQueryGrowthRate(scalingDuration, clock); // in fraction per minute of the current traffic
-        double growthRateHeadroom = 1 + maxGrowthRate * scalingDuration.toMinutes();
-        // Cap headroom at 10% above the historical observed peak
-        double fractionOfMax = clusterTimeseries.queryFractionOfMax(scalingDuration, clock);
-        if (fractionOfMax != 0)
-            growthRateHeadroom = Math.min(growthRateHeadroom, 1 / fractionOfMax + 0.1);
-
-        // How much headroom is needed to handle sudden arrival of additional traffic due to another zone going down?
-        double maxTrafficShiftHeadroom = 10.0; // Cap to avoid extreme sizes from a current very small share
-        double trafficShiftHeadroom;
-        if (application.status().maxReadShare() == 0) // No traffic fraction data
-            trafficShiftHeadroom = 2.0; // assume we currently get half of the global share of traffic
-        else if (application.status().currentReadShare() == 0)
-            trafficShiftHeadroom = maxTrafficShiftHeadroom;
-        else
-            trafficShiftHeadroom = application.status().maxReadShare() / application.status().currentReadShare();
-        trafficShiftHeadroom = Math.min(trafficShiftHeadroom, maxTrafficShiftHeadroom);
-
-        // Assumptions: 1) Write load is not organic so we should not grow to handle more.
-        //                 (TODO: But allow applications to set their target write rate and size for that)
-        //              2) Write load does not change in BCP scenarios.
-        return queryCpuFraction * 1 / growthRateHeadroom * 1 / trafficShiftHeadroom * idealQueryCpuLoad() +
-               (1 - queryCpuFraction) * idealWriteCpuLoad();
-    }
-    
-    private static double queryCpuFraction(ClusterTimeseries clusterTimeseries, Duration scalingDuration, Clock clock) {
-        OptionalDouble queryRate = clusterTimeseries.queryRate(scalingDuration, clock);
-        OptionalDouble writeRate = clusterTimeseries.writeRate(scalingDuration, clock);
-        if (queryRate.orElse(0) == 0 && writeRate.orElse(0) == 0) return queryCpuFraction(0.5);
-        return queryCpuFraction(queryRate.orElse(0) / (queryRate.orElse(0) + writeRate.orElse(0)));
-    }
-
-    private static double queryCpuFraction(double queryFraction) {
-        double relativeQueryCost = 9; // How much more expensive are queries than writes? TODO: Measure
-        double writeFraction = 1 - queryFraction;
-        return queryFraction * relativeQueryCost / (queryFraction * relativeQueryCost + writeFraction);
-    }
-
-    public static double idealQueryCpuLoad() { return Resource.cpu.idealAverageLoad(); }
-
-    public static double idealWriteCpuLoad() { return 0.95; }
-
-    public static double idealMemoryLoad() { return Resource.memory.idealAverageLoad(); }
-
-    public static double idealDiskLoad() { return Resource.disk.idealAverageLoad(); }
-
 }
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java
index 8d8d7e01049..cc59860384b 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/ApplicationSerializer.java
@@ -7,9 +7,11 @@ import com.yahoo.slime.Cursor;
 import com.yahoo.slime.Slime;
 import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeList;
+import com.yahoo.vespa.hosted.provision.NodeRepository;
 import com.yahoo.vespa.hosted.provision.applications.Application;
 import com.yahoo.vespa.hosted.provision.applications.Cluster;
 import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;
+import com.yahoo.vespa.hosted.provision.autoscale.ClusterModel;
 import com.yahoo.vespa.hosted.provision.autoscale.ClusterNodesTimeseries;
 import com.yahoo.vespa.hosted.provision.autoscale.ClusterTimeseries;
 import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb;
@@ -29,40 +31,45 @@ import java.util.List;
  */
 public class ApplicationSerializer {
 
-    public static Slime toSlime(Application application, NodeList applicationNodes, MetricsDb metricsDb, URI applicationUri) {
+    public static Slime toSlime(Application application,
+                                NodeList applicationNodes,
+                                MetricsDb metricsDb,
+                                NodeRepository nodeRepository,
+                                URI applicationUri) {
         Slime slime = new Slime();
-        toSlime(application, applicationNodes, metricsDb, slime.setObject(), applicationUri);
+        toSlime(application, applicationNodes, metricsDb, nodeRepository, slime.setObject(), applicationUri);
         return slime;
     }
 
     private static void toSlime(Application application,
                                 NodeList applicationNodes,
                                 MetricsDb metricsDb,
+                                NodeRepository nodeRepository,
                                 Cursor object,
                                 URI applicationUri) {
         object.setString("url", applicationUri.toString());
         object.setString("id", application.id().toFullString());
-        clustersToSlime(application, applicationNodes, metricsDb, object.setObject("clusters"));
+        clustersToSlime(application, applicationNodes, metricsDb, nodeRepository, object.setObject("clusters"));
     }
 
     private static void clustersToSlime(Application application,
                                         NodeList applicationNodes,
                                         MetricsDb metricsDb,
+                                        NodeRepository nodeRepository,
                                         Cursor clustersObject) {
-        application.clusters().values().forEach(cluster -> toSlime(application, cluster, applicationNodes, metricsDb, clustersObject));
+        application.clusters().values().forEach(cluster -> toSlime(application, cluster, applicationNodes, metricsDb, nodeRepository, clustersObject));
     }
 
     private static void toSlime(Application application,
                                 Cluster cluster,
                                 NodeList applicationNodes,
                                 MetricsDb metricsDb,
+                                NodeRepository nodeRepository,
                                 Cursor clustersObject) {
         NodeList nodes = applicationNodes.not().retired().cluster(cluster.id());
         if (nodes.isEmpty()) return;
         ClusterResources currentResources = nodes.toResources();
-        Duration scalingDuration = cluster.scalingDuration(nodes.clusterSpec());
-        var clusterNodesTimeseries = new ClusterNodesTimeseries(Duration.ofHours(1), cluster, nodes, metricsDb);
-        var clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id());
+        ClusterModel clusterModel = new ClusterModel(application, cluster, nodes, metricsDb, nodeRepository.clock());
 
         Cursor clusterObject = clustersObject.setObject(cluster.id().value());
         clusterObject.setString("type", nodes.clusterSpec().type().name());
@@ -72,12 +79,12 @@ public class ApplicationSerializer {
         if (cluster.shouldSuggestResources(currentResources))
             cluster.suggestedResources().ifPresent(suggested -> toSlime(suggested.resources(), clusterObject.setObject("suggested")));
         cluster.targetResources().ifPresent(target -> toSlime(target, clusterObject.setObject("target")));
-        clusterUtilizationToSlime(application, scalingDuration, clusterTimeseries, clusterNodesTimeseries, metricsDb.clock(), clusterObject.setObject("utilization"));
+        clusterUtilizationToSlime(clusterModel, clusterObject.setObject("utilization"));
         scalingEventsToSlime(cluster.scalingEvents(), clusterObject.setArray("scalingEvents"));
         clusterObject.setString("autoscalingStatus", cluster.autoscalingStatus());
-        clusterObject.setLong("scalingDuration", scalingDuration.toMillis());
-        clusterObject.setDouble("maxQueryGrowthRate", clusterTimeseries.maxQueryGrowthRate(scalingDuration, metricsDb.clock()));
-        clusterObject.setDouble("currentQueryFractionOfMax", clusterTimeseries.queryFractionOfMax(scalingDuration, metricsDb.clock()));
+        clusterObject.setLong("scalingDuration", clusterModel.scalingDuration().toMillis());
+        clusterObject.setDouble("maxQueryGrowthRate", clusterModel.maxQueryGrowthRate());
+        clusterObject.setDouble("currentQueryFractionOfMax", clusterModel.queryFractionOfMax());
     }
 
     private static void toSlime(ClusterResources resources, Cursor clusterResourcesObject) {
@@ -86,18 +93,13 @@ public class ApplicationSerializer {
         NodeResourcesSerializer.toSlime(resources.nodeResources(), clusterResourcesObject.setObject("resources"));
     }
 
-    private static void clusterUtilizationToSlime(Application application,
-                                                  Duration scalingDuration,
-                                                  ClusterTimeseries clusterTimeseries,
-                                                  ClusterNodesTimeseries clusterNodesTimeseries,
-                                                  Clock clock,
-                                                  Cursor utilizationObject) {
-        utilizationObject.setDouble("cpu", clusterNodesTimeseries.averageLoad(Resource.cpu));
-        utilizationObject.setDouble("idealCpu", ResourceTarget.idealCpuLoad(scalingDuration, clusterTimeseries, application, clock));
-        utilizationObject.setDouble("memory", clusterNodesTimeseries.averageLoad(Resource.memory));
-        utilizationObject.setDouble("idealMemory", ResourceTarget.idealMemoryLoad());
-        utilizationObject.setDouble("disk", clusterNodesTimeseries.averageLoad(Resource.disk));
-        utilizationObject.setDouble("idealDisk", ResourceTarget.idealDiskLoad());
+    private static void clusterUtilizationToSlime(ClusterModel clusterModel, Cursor utilizationObject) {
+        utilizationObject.setDouble("cpu", clusterModel.averageLoad(Resource.cpu));
+        utilizationObject.setDouble("idealCpu", clusterModel.idealLoad(Resource.cpu));
+        utilizationObject.setDouble("memory", clusterModel.averageLoad(Resource.memory));
+        utilizationObject.setDouble("idealMemory", clusterModel.idealLoad(Resource.memory));
+        utilizationObject.setDouble("disk", clusterModel.averageLoad(Resource.disk));
+        utilizationObject.setDouble("idealDisk", clusterModel.idealLoad(Resource.disk));
     }
 
     private static void scalingEventsToSlime(List<ScalingEvent> scalingEvents, Cursor scalingEventsArray) {
diff --git a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
index 2442ff9d565..52081877d98 100644
--- a/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
+++ b/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/restapi/NodesV2ApiHandler.java
@@ -447,6 +447,7 @@ public class NodesV2ApiHandler extends LoggingRequestHandler {
         Slime slime = ApplicationSerializer.toSlime(application.get(),
                                                     nodeRepository.nodes().list(Node.State.active).owner(id),
                                                     metricsDb,
+                                                    nodeRepository,
                                                     withPath("/nodes/v2/applications/" + id, uri));
         return new SlimeJsonResponse(slime);
     }
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
index cc3eeb47073..89da20c5550 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/AutoscalingTester.java
@@ -141,8 +141,8 @@ class AutoscalingTester {
             clock().advance(Duration.ofMinutes(5));
             for (Node node : nodes) {
                 float cpu = value * oneExtraNodeFactor;
-                float memory  = (float) Resource.memory.idealAverageLoad() * otherResourcesLoad * oneExtraNodeFactor;
-                float disk = (float) Resource.disk.idealAverageLoad() * otherResourcesLoad * oneExtraNodeFactor;
+                float memory  = (float) ClusterModel.idealMemoryLoad * otherResourcesLoad * oneExtraNodeFactor;
+                float disk = (float) ClusterModel.idealDiskLoad * otherResourcesLoad * oneExtraNodeFactor;
                 db.addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(clock().instant(),
                                                                                              cpu,
                                                                                              memory,
@@ -174,7 +174,7 @@ class AutoscalingTester {
             for (Node node : nodes) {
                 float cpu  = (float) 0.2 * otherResourcesLoad * oneExtraNodeFactor;
                 float memory = value * oneExtraNodeFactor;
-                float disk = (float) Resource.disk.idealAverageLoad() * otherResourcesLoad * oneExtraNodeFactor;
+                float disk = (float) ClusterModel.idealDiskLoad * otherResourcesLoad * oneExtraNodeFactor;
                 db.addNodeMetrics(List.of(new Pair<>(node.hostname(), new NodeMetricSnapshot(clock().instant(),
                                                                                              cpu,
                                                                                              memory,
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
index 6f60de62f1f..70550b0a7c3 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceTargetTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/autoscale/ClusterModelTest.java
@@ -22,7 +22,7 @@ import static org.junit.Assert.assertEquals;
 /**
  * @author bratseth
  */
-public class ResourceTargetTest {
+public class ClusterModelTest {
 
     private static final double delta = 0.001;
 
@@ -34,22 +34,16 @@ public class ResourceTargetTest {
         application = application.with(cluster);
 
         // No current traffic share: Ideal load is low but capped
-        application = application.with(new Status(0.0, 1.0));
-        assertEquals(0.131,
-                     ResourceTarget.idealCpuLoad(Duration.ofMinutes(10),
-                                                 timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
-                                                 application,
-                                                 clock),
-                     delta);
+        var model1 = new ClusterModel(application.with(new Status(0.0, 1.0)),
+                                      cluster, clock, Duration.ofMinutes(10),
+                                      timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock));
+        assertEquals(0.131, model1.idealLoad(Resource.cpu), delta);
 
         // Almost no current traffic share: Ideal load is low but capped
-        application = application.with(new Status(0.0001, 1.0));
-        assertEquals(0.131,
-                     ResourceTarget.idealCpuLoad(Duration.ofMinutes(10),
-                                                 timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
-                                                 application,
-                                                 clock),
-                     delta);
+        var model2 = new ClusterModel(application.with(new Status(0.0001, 1.0)),
+                                      cluster, clock, Duration.ofMinutes(10),
+                                      timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock));
+        assertEquals(0.131, model2.idealLoad(Resource.cpu), delta);
     }
 
     @Test
@@ -61,21 +55,16 @@ public class ResourceTargetTest {
         application = application.with(cluster);
 
         // No current traffic: Ideal load is low but capped
-        assertEquals(0.275,
-                     ResourceTarget.idealCpuLoad(Duration.ofMinutes(10),
-                                                 timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock),
-                                                 application,
-                                                 clock),
-                     delta);
+        var model1 = new ClusterModel(application,
+                                      cluster, clock, Duration.ofMinutes(10),
+                                      timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0, t -> 0.0, clock));
+        assertEquals(0.275, model1.idealLoad(Resource.cpu), delta);
 
         // Almost current traffic: Ideal load is low but capped
-        application = application.with(new Status(0.0001, 1.0));
-        assertEquals(0.04,
-                     ResourceTarget.idealCpuLoad(Duration.ofMinutes(10),
-                                                 timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0, clock),
-                                                 application,
-                                                 clock),
-                     delta);
+        var model2 = new ClusterModel(application.with(new Status(0.0001, 1.0)),
+                                      cluster, clock, Duration.ofMinutes(10),
+                                      timeseries(cluster,100, t -> t == 0 ? 10000.0 : 0.0001, t -> 0.0, clock));
+        assertEquals(0.04, model2.idealLoad(Resource.cpu), delta);
     }
 
     private Cluster cluster(NodeResources resources) {
diff --git a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
index 9ae67cef235..10851252c98 100644
--- a/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
+++ b/node-repository/src/test/java/com/yahoo/vespa/hosted/provision/maintenance/ScalingSuggestionsMaintainerTest.java
@@ -17,6 +17,7 @@ import com.yahoo.vespa.hosted.provision.Node;
 import com.yahoo.vespa.hosted.provision.NodeList;
 import com.yahoo.vespa.hosted.provision.NodeRepository;
 import com.yahoo.vespa.hosted.provision.applications.Cluster;
+import com.yahoo.vespa.hosted.provision.autoscale.ClusterModel;
 import com.yahoo.vespa.hosted.provision.autoscale.NodeMetricSnapshot;
 import com.yahoo.vespa.hosted.provision.autoscale.MetricsDb;
 import com.yahoo.vespa.hosted.provision.autoscale.Resource;
@@ -99,9 +100,7 @@ public class ScalingSuggestionsMaintainerTest {
         var suggested = tester.nodeRepository().applications().get(app1).get().cluster(cluster1.id()).get().suggestedResources().get().resources();
         tester.deploy(app1, cluster1, Capacity.from(suggested, suggested, false, true));
         tester.clock().advance(Duration.ofDays(2));
-        addMeasurements(0.2f,
-                        (float)Resource.memory.idealAverageLoad(),
-                        (float)Resource.disk.idealAverageLoad(),
+        addMeasurements(0.2f, 0.7f, 0.6f,
                         0, 500, app1, tester.nodeRepository(), metricsDb);
         maintainer.maintain();
         assertEquals("Suggestion is to keep the current allocation",
author	Valerij Fredriksen <freva@users.noreply.github.com>	2021-03-18 13:27:14 +0100
committer	GitHub <noreply@github.com>	2021-03-18 13:27:14 +0100
commit	7c5948aceceec2a3c279755622876932fdf5a3ed (patch)
tree	1801a9db90991b40f6f2ff27f609d02f049e3ce0 /node-repository
parent	2e0ad58bb9bbb5cc24289acff25046a5b9442e2b (diff)
parent	95a73e1587180eee3a272aed8a02e865943666cc (diff)