aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
blob: 2d192fae11fd12cd37918732de47731b2cadc742 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.autoscale;

import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
import com.yahoo.vespa.hosted.provision.applications.ScalingEvent;

import java.time.Duration;
import java.time.Instant;
import java.util.Objects;
import java.util.Optional;

/**
 * The autoscaler gives advice about what resources should be allocated to a cluster based on observed behavior.
 *
 * @author bratseth
 */
public class Autoscaler {

    /**
     * What cost difference factor is worth a reallocation?
     * Compared against the difference between two costs relative to the mean of the two.
     */
    private static final double costDifferenceWorthReallocation = 0.1;
    /**
     * What difference factor for a resource is worth a reallocation?
     * Applied separately to total vcpu, memory and disk, relative to the mean of the two values.
     */
    private static final double resourceDifferenceWorthReallocation = 0.1;

    /** The metrics source handed to each ClusterTimeseries built while evaluating a cluster */
    private final MetricsDb metricsDb;
    /** Supplies the clock and the reserved node list, and is passed on to the allocation helpers */
    private final NodeRepository nodeRepository;
    /** Finds the best allocation for a resource target, given the current allocation and limits */
    private final AllocationOptimizer allocationOptimizer;

    /**
     * Creates an autoscaler.
     *
     * @param metricsDb the metrics database passed to the cluster timeseries when evaluating load
     * @param nodeRepository the node repository this works against; also used to create the allocation optimizer
     */
    public Autoscaler(MetricsDb metricsDb, NodeRepository nodeRepository) {
        this.nodeRepository = nodeRepository;
        this.metricsDb = metricsDb;
        this.allocationOptimizer = new AllocationOptimizer(this.nodeRepository);
    }

    /**
     * Suggests a scaling of a cluster, ignoring the min and max limits configured for it:
     * The evaluation is the same as for autoscaling, but runs with empty limits.
     *
     * @param application the application owning the cluster
     * @param cluster the cluster to give a suggestion for
     * @param clusterNodes the list of all the active nodes in a cluster
     * @return scaling advice for this cluster
     */
    public Advice suggest(Application application, Cluster cluster, NodeList clusterNodes) {
        Limits unlimited = Limits.empty();
        return autoscale(application, cluster, clusterNodes, unlimited);
    }

    /**
     * Autoscales a cluster by load, staying inside the min and max limits of the cluster.
     * When min and max resources are equal there is nothing to choose between,
     * and no advice is given.
     *
     * @param application the application owning the cluster
     * @param cluster the cluster to autoscale
     * @param clusterNodes the list of all the active nodes in a cluster
     * @return scaling advice for this cluster
     */
    public Advice autoscale(Application application, Cluster cluster, NodeList clusterNodes) {
        boolean fixedSize = cluster.minResources().equals(cluster.maxResources());
        if (fixedSize) {
            return Advice.none("Autoscaling is not enabled");
        }
        Limits clusterLimits = Limits.of(cluster);
        return autoscale(application, cluster, clusterNodes, clusterLimits);
    }

    /**
     * Evaluates a cluster and returns scaling advice within the given limits.
     *
     * No advice is given while the cluster is changing or while too few measurements are available.
     * Advice to keep the current allocation is given when the cluster was rescaled within the
     * scaling window, when no allocation is possible, when the best allocation is similar to
     * the current one, or when a reduction is proposed less than three scaling windows after the last rescaling.
     *
     * @param application the application owning the cluster
     * @param cluster the cluster to evaluate
     * @param clusterNodes the list of all the active nodes in the cluster
     * @param limits the limits the returned allocation must be within
     * @return scaling advice for this cluster
     */
    private Advice autoscale(Application application, Cluster cluster, NodeList clusterNodes, Limits limits) {
        if ( ! stable(clusterNodes, nodeRepository)) {
            return Advice.none("Cluster change in progress");
        }

        Duration window = scalingWindow(clusterNodes.clusterSpec(), cluster);
        if (scaledIn(window, cluster)) {
            return Advice.dontScale("Won't autoscale now: Less than " + window + " since last rescaling");
        }

        ClusterTimeseries timeseries = new ClusterTimeseries(window, cluster, clusterNodes, metricsDb);
        AllocatableClusterResources current =
                new AllocatableClusterResources(clusterNodes.asList(), nodeRepository, cluster.exclusive());

        // Require enough measurements on average per node ...
        int observedPerNode = timeseries.measurementsPerNode();
        int requiredPerNode = minimumMeasurementsPerNode(window);
        if (observedPerNode < requiredPerNode) {
            return Advice.none("Collecting more data before making new scaling decisions: " +
                               "Have " + observedPerNode + " measurements per node but require " +
                               requiredPerNode);
        }

        // ... and measurements from every node in the cluster
        int measuredNodes = timeseries.nodesMeasured();
        if (measuredNodes != clusterNodes.size()) {
            return Advice.none("Collecting more data before making new scaling decisions: " +
                               "Have measurements from " + measuredNodes + " but require from " + clusterNodes.size());
        }

        var target = ResourceTarget.idealLoad(timeseries.averageLoad(Resource.cpu),
                                              timeseries.averageLoad(Resource.memory),
                                              timeseries.averageLoad(Resource.disk),
                                              current,
                                              application);

        Optional<AllocatableClusterResources> candidate = allocationOptimizer.findBestAllocation(target, current, limits);
        if (candidate.isEmpty()) {
            return Advice.dontScale("No allocation changes are possible within configured limits");
        }
        AllocatableClusterResources best = candidate.get();

        if (similar(best.realResources(), current.realResources())) {
            return Advice.dontScale("Cluster is ideally scaled within configured limits");
        }

        // Be more reluctant to reduce resources than to add them
        if (isDownscaling(best, current)) {
            Duration downscalingDelay = window.multipliedBy(3);
            if (scaledIn(downscalingDelay, cluster)) {
                return Advice.dontScale("Waiting " + downscalingDelay + " since last rescaling before reducing resources");
            }
        }

        return Advice.scaleTo(best.advertisedResources());
    }

    /**
     * Returns true if both total resources and total cost are similar:
     * The cost must differ by less than the cost threshold, and each of total vcpu,
     * memory and disk must differ by less than the resource threshold.
     *
     * @param a the first cluster resources to compare
     * @param b the second cluster resources to compare
     * @return true if no compared dimension differs enough to be worth a reallocation
     */
    public static boolean similar(ClusterResources a, ClusterResources b) {
        return similar(a.cost(), b.cost(), costDifferenceWorthReallocation) &&
               similar(a.totalResources().vcpu(), b.totalResources().vcpu(), resourceDifferenceWorthReallocation) &&
               similar(a.totalResources().memoryGb(), b.totalResources().memoryGb(), resourceDifferenceWorthReallocation) &&
               similar(a.totalResources().diskGb(), b.totalResources().diskGb(), resourceDifferenceWorthReallocation);
    }

    /**
     * Returns true if the difference between the two values, relative to their mean,
     * is less than the given threshold.
     *
     * @param r1 the first value
     * @param r2 the second value
     * @param threshold the relative difference at which the values stop being similar
     * @return true if the values are equal, or their relative difference is below the threshold
     */
    private static boolean similar(double r1, double r2, double threshold) {
        // Equal values are always similar. This must be checked explicitly because the relative
        // difference of 0 and 0 is 0/0 = NaN, and any comparison with NaN is false.
        if (r1 == r2) return true;
        return Math.abs(r1 - r2) / (( r1 + r2) / 2) < threshold;
    }

    /**
     * Returns true if this reduces total resources in any dimension.
     * This is decided from the advertised resources, summed over the cluster and reduced to
     * just the numbers: It is a downscaling if the target total does not satisfy the current total.
     *
     * @param target the allocation considered
     * @param current the allocation the cluster has now
     * @return true if the target total does not satisfy the current total
     */
    private boolean isDownscaling(AllocatableClusterResources target, AllocatableClusterResources current) {
        var wanted = target.advertisedResources().totalResources().justNumbers();
        var existing = current.advertisedResources().totalResources().justNumbers();
        boolean coversExisting = wanted.satisfies(existing);
        return ! coversExisting;
    }

    /**
     * Returns whether the last scaling event of this cluster happened less than the given delay ago.
     * A cluster with no scaling events is treated as last scaled at Instant.MIN, which gives false.
     *
     * @param delay how far back in time from now to look
     * @param cluster the cluster whose last scaling event is checked
     * @return true if the last scaling event is after now minus the delay
     */
    private boolean scaledIn(Duration delay, Cluster cluster) {
        Instant lastScaledAt = cluster.lastScalingEvent().map(lastEvent -> lastEvent.at()).orElse(Instant.MIN);
        Instant windowStart = nodeRepository.clock().instant().minus(delay);
        return lastScaledAt.isAfter(windowStart);
    }

    /**
     * The duration of the window we need to consider to make a scaling decision.
     * See also minimumMeasurementsPerNode.
     *
     * This is the average duration of the completed scaling events of the cluster, but at least
     * 12 hours for stateful clusters and at least 5 minutes otherwise. With no completed events
     * the defaults are 12 hours for stateful clusters and 10 minutes for others.
     *
     * NOTE(review): the result is not capped by maxScalingWindow() here - confirm that this is intended.
     *
     * @param clusterSpec the spec of the cluster, telling whether it is stateful
     * @param cluster the cluster whose scaling events are considered
     * @return the window to consider
     */
    private Duration scalingWindow(ClusterSpec clusterSpec, Cluster cluster) {
        Duration sum = Duration.ZERO;
        int completed = 0;
        for (ScalingEvent event : cluster.scalingEvents()) {
            var duration = event.duration();
            if (duration.isPresent()) { // events without a duration have not completed
                sum = sum.plus(duration.get());
                completed++;
            }
        }

        if (completed == 0) // Use defaults
            return clusterSpec.isStateful() ? Duration.ofHours(12) : Duration.ofMinutes(10);

        Duration average = sum.dividedBy(completed);

        // TODO: Remove when we have reliable completion for content clusters
        if (clusterSpec.isStateful() && average.compareTo(Duration.ofHours(12)) < 0)
            return Duration.ofHours(12);

        Duration minimum = Duration.ofMinutes(5);
        return average.compareTo(minimum) < 0 ? minimum : average;
    }

    /**
     * Returns the maximal scaling window: A fixed duration of 48 hours.
     * NOTE(review): this is not referenced by the window computation visible in this class -
     * presumably it is used by callers elsewhere; confirm.
     */
    static Duration maxScalingWindow() {
        return Duration.ofDays(2);
    }

    /**
     * Returns the minimum measurements per node (average) we require to give autoscaling advice.
     *
     * @param scalingWindow the window measurements are collected over
     * @return 80% of one measurement per 5 minutes of the window, rounded, but never less than 1
     */
    private int minimumMeasurementsPerNode(Duration scalingWindow) {
        // Measurements are ideally taken every minute, but no guarantees
        // (network, nodes may be down, collecting is single threaded and may take longer than 1 minute to complete).
        // Since the metric window is 5 minutes, we won't really improve from measuring more often.
        long fiveMinutePeriods = scalingWindow.toMinutes() / 5;
        long required = Math.round(0.8 * fiveMinutePeriods); // Allow 20% metrics collection blackout
        return (int) Math.max(1L, required);
    }

    /**
     * Returns whether the cluster consisting of the given nodes is stable, that is, neither
     * processing recent changes nor being deployed to.
     *
     * @param nodes the nodes of the cluster; each must have an allocation, and the list must be
     *              non-empty for the check to get as far as the deployment check
     * @param nodeRepository the node repository to look for reserved nodes in
     * @return false if any node wants to retire, is retired or is removable, or if the owner
     *         of the first node has any reserved nodes; true otherwise
     */
    public static boolean stable(NodeList nodes, NodeRepository nodeRepository) {
        // The cluster is processing recent changes
        boolean changing = nodes.stream().anyMatch(node -> {
            if (node.status().wantToRetire()) return true;
            var allocation = node.allocation().get();
            return allocation.membership().retired() || allocation.isRemovable();
        });
        if (changing) {
            return false;
        }

        // A deployment is ongoing
        boolean deploying = nodeRepository.nodes()
                                          .list(Node.State.reserved)
                                          .owner(nodes.first().get().allocation().get().owner())
                                          .size() > 0;
        return ! deploying;
    }

    /**
     * The outcome of an autoscaling evaluation: Either no advice, advice to keep the current
     * allocation, or advice to scale to a target. A reason is always given.
     */
    public static class Advice {

        /** Whether this contains any advice (which may be to keep the current allocation) */
        private final boolean present;
        /** The resources to scale to, or empty if this does not advise a change */
        private final Optional<ClusterResources> target;
        /** The human readable reason for this advice, never null */
        private final String reason;

        private Advice(Optional<ClusterResources> target, boolean present, String reason) {
            this.target = target;
            this.present = present;
            this.reason = Objects.requireNonNull(reason);
        }

        /**
         * Returns the autoscaling target that should be set by this advice.
         * This is empty if the advice is to keep the current allocation.
         */
        public Optional<ClusterResources> target() { return target; }

        /** True if this does not provide any advice */
        public boolean isEmpty() { return ! present; }

        /** True if this provides advice (which may be to keep the current allocation) */
        public boolean isPresent() { return present; }

        /** The reason for this advice */
        public String reason() { return reason; }

        /** Creates an instance which gives no advice, for the given reason */
        private static Advice none(String reason) { return new Advice(Optional.empty(), false, reason); }

        /** Creates advice to keep the current allocation, for the given reason */
        private static Advice dontScale(String reason) { return new Advice(Optional.empty(), true, reason); }

        /** Creates advice to scale to the given target */
        private static Advice scaleTo(ClusterResources target) {
            return new Advice(Optional.of(target), true, "Scaling due to load changes");
        }

        /** Returns "autoscaling advice: " followed by "Scale to [target]", "Don't scale" or "None" */
        @Override
        public String toString() {
            String advice;
            if ( ! present)
                advice = "None"; // no leading space here: the prefix below already ends with one
            else if (target.isPresent())
                advice = "Scale to " + target.get();
            else
                advice = "Don't scale";
            return "autoscaling advice: " + advice;
        }

    }

}