aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/Autoscaler.java
blob: f1be485d368c796b2e1bf1bdb19cedc1f60de4ce (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.autoscale;

import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.CloudName;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.provisioning.HostResourcesCalculator;

import java.time.Duration;
import java.util.List;
import java.util.Optional;

/**
 * The autoscaler makes decisions about the flavor and node count that should be allocated to a cluster
 * based on observed behavior.
 *
 * @author bratseth
 */
public class Autoscaler {

    /*
     TODO:
     - X Don't always go for more, smaller nodes
     - X Test gc
     - X Test AutoscalingMaintainer
     - X Implement node metrics fetch
     - X Avoid making decisions for the same app at multiple config servers
     - Have a better idea about whether we have sufficient information to make decisions
     - Consider taking spikes/variance into account
     - Take observed regulation lag (startup+redistribution) into account when deciding regulation observation window
     */

    /** A measurement window with fewer measurements than this is not used to make decisions */
    private static final int minimumMeasurements = 500; // TODO: Per node instead? Also say something about interval?

    /** Only change if the difference between the current and best ratio is larger than this */
    private static final double resourceDifferenceRatioWorthReallocation = 0.1;

    // We only depend on the ratios between these values
    private static final double cpuUnitCost = 12.0;
    private static final double memoryUnitCost = 1.2;
    private static final double diskUnitCost = 0.045;

    private final HostResourcesCalculator hostResourcesCalculator;
    private final NodeMetricsDb metricsDb;
    private final NodeRepository nodeRepository;

    /**
     * Creates an autoscaler.
     *
     * @param hostResourcesCalculator used to find the resources a flavor is costed by
     * @param metricsDb the source of resource usage measurements
     * @param nodeRepository provides the zone, the clock and the available flavors
     */
    public Autoscaler(HostResourcesCalculator hostResourcesCalculator,
                      NodeMetricsDb metricsDb,
                      NodeRepository nodeRepository) {
        this.hostResourcesCalculator = hostResourcesCalculator;
        this.metricsDb = metricsDb;
        this.nodeRepository = nodeRepository;
    }

    /**
     * Suggests new resources for a cluster based on its measured cpu, memory and disk usage.
     *
     * @param applicationId the application owning the cluster
     * @param cluster the cluster to consider
     * @param clusterNodes the nodes currently in the cluster
     * @return the suggested resources, or empty if the cluster has no nodes, is in flux,
     *         lacks sufficient measurements, has no allocatable alternative,
     *         or the best alternative is too similar to the current allocation to be worth a change
     */
    public Optional<ClusterResources> autoscale(ApplicationId applicationId, ClusterSpec cluster, List<Node> clusterNodes) {
        if (clusterNodes.isEmpty()) return Optional.empty(); // Nothing to measure, and nothing to base current resources on
        if (clusterNodes.stream().anyMatch(Autoscaler::isInFlux))
            return Optional.empty(); // Don't autoscale clusters that are in flux

        ClusterResources currentAllocation = new ClusterResources(clusterNodes);
        Optional<Double> totalCpuSpent    = averageUseOf(Resource.cpu,    applicationId, cluster, clusterNodes);
        Optional<Double> totalMemorySpent = averageUseOf(Resource.memory, applicationId, cluster, clusterNodes);
        Optional<Double> totalDiskSpent   = averageUseOf(Resource.disk,   applicationId, cluster, clusterNodes);
        if (totalCpuSpent.isEmpty() || totalMemorySpent.isEmpty() || totalDiskSpent.isEmpty()) return Optional.empty();

        Optional<ClusterResources> bestAllocation = findBestAllocation(totalCpuSpent.get(),
                                                                       totalMemorySpent.get(),
                                                                       totalDiskSpent.get(),
                                                                       currentAllocation);
        if (bestAllocation.isPresent() && isSimilar(bestAllocation.get(), currentAllocation))
            return Optional.empty(); // Avoid small changes
        return bestAllocation;
    }

    /** Returns whether this node is wanted retired, is retired, or is removable */
    private static boolean isInFlux(Node node) {
        return node.status().wantToRetire() ||
               node.allocation().get().membership().retired() ||
               node.allocation().get().isRemovable();
    }

    /**
     * Returns the lowest cost allocatable resources among the candidates produced by ResourceIterator
     * for the given total resource usage, or empty if none of the candidates are allocatable.
     * On equal cost the first candidate encountered is kept.
     */
    private Optional<ClusterResources> findBestAllocation(double totalCpu, double totalMemory, double totalDisk,
                                                          ClusterResources currentAllocation) {
        Optional<ClusterResourcesWithCost> bestAllocation = Optional.empty();
        for (ResourceIterator i = new ResourceIterator(totalCpu, totalMemory, totalDisk, currentAllocation); i.hasNext(); ) {
            Optional<ClusterResourcesWithCost> candidate = toAllocatableResources(i.next());
            if (candidate.isEmpty()) continue;

            if (bestAllocation.isEmpty() || candidate.get().cost() < bestAllocation.get().cost())
                bestAllocation = candidate;
        }
        return bestAllocation.map(ClusterResourcesWithCost::clusterResources);
    }

    /**
     * Returns whether two allocations are so similar that changing from one to the other is not worthwhile:
     * They must have the same node count, and similar vcpu, memory and disk per node.
     */
    private boolean isSimilar(ClusterResources a1, ClusterResources a2) {
        if (a1.nodes() != a2.nodes()) return false; // A full node is always a significant difference
        return isSimilar(a1.nodeResources().vcpu(), a2.nodeResources().vcpu()) &&
               isSimilar(a1.nodeResources().memoryGb(), a2.nodeResources().memoryGb()) &&
               isSimilar(a1.nodeResources().diskGb(), a2.nodeResources().diskGb());
    }

    /**
     * Returns whether the difference between the two values, relative to r1,
     * is smaller than resourceDifferenceRatioWorthReallocation.
     */
    private boolean isSimilar(double r1, double r2) {
        if (r1 == r2) return true; // Also covers both being 0, where the ratio below is NaN and compares as false
        return Math.abs(r1 - r2) / r1 < resourceDifferenceRatioWorthReallocation;
    }

    /**
     * Returns allocatable resources satisfying the given resources, with their cost,
     * or empty if none are available.
     *
     * When hosts are shared the requested resources are returned as-is provided some available flavor
     * satisfies them. Otherwise the node resources are replaced by those of the lowest cost flavor
     * satisfying the request.
     */
    private Optional<ClusterResourcesWithCost> toAllocatableResources(ClusterResources resources) {
        if (allowsHostSharing(nodeRepository.zone().cloud())) {
            // Return the requested resources, or empty if they cannot fit on existing hosts
            for (Flavor flavor : nodeRepository.getAvailableFlavors().getFlavors()) {
                if (flavor.resources().satisfies(resources.nodeResources()))
                    return Optional.of(new ClusterResourcesWithCost(resources,
                                                                    costOf(resources.nodeResources()) * resources.nodes()));
            }
            return Optional.empty();
        }

        // Return the cheapest flavor satisfying the target resources, if any
        double bestCost = Double.MAX_VALUE;
        Optional<Flavor> bestFlavor = Optional.empty();
        for (Flavor flavor : nodeRepository.getAvailableFlavors().getFlavors()) {
            if ( ! flavor.resources().satisfies(resources.nodeResources())) continue;

            // Compare and remember the same cost, such that the chosen flavor is the cheapest by that measure
            double cost = costOf(flavor);
            if (bestFlavor.isEmpty() || cost < bestCost) {
                bestFlavor = Optional.of(flavor);
                bestCost = cost;
            }
        }
        if (bestFlavor.isEmpty()) return Optional.empty();
        return Optional.of(new ClusterResourcesWithCost(resources.with(bestFlavor.get().resources()),
                                                        bestCost * resources.nodes()));
    }

    /**
     * Returns the average total (over all nodes) of this resource in the measurement window,
     * or empty if we are not in a position to take decisions from these measurements at this time.
     *
     * The total is the window average multiplied by the amount of this resource on the first node
     * and by the number of nodes.
     * NOTE(review): this presumably assumes all nodes in the cluster have the same resources - confirm.
     *
     * @param clusterNodes the nodes of the cluster, must be non-empty
     */
    private Optional<Double> averageUseOf(Resource resource, ApplicationId applicationId, ClusterSpec cluster, List<Node> clusterNodes) {
        NodeResources currentResources = clusterNodes.get(0).flavor().resources();

        NodeMetricsDb.Window window = metricsDb.getWindow(nodeRepository.clock().instant().minus(scalingWindow(cluster.type())),
                                                          resource,
                                                          clusterNodes);

        if (window.measurementCount() < minimumMeasurements) return Optional.empty();
        if (window.hostnames() != clusterNodes.size()) return Optional.empty(); // Regulate only when all nodes are measured

        return Optional.of(window.average() * resource.valueFrom(currentResources) * clusterNodes.size());
    }

    /** The duration of the window we need to consider to make a scaling decision */
    private Duration scalingWindow(ClusterSpec.Type clusterType) {
        // Both cases currently use the same window, but are kept apart as they are expected to diverge
        if (clusterType.isContent()) return Duration.ofHours(12); // Ideally we should use observed redistribution time
        return Duration.ofHours(12); // TODO: Measure much more often to get this down to minutes. And, ideally we should take node startup time into account
    }

    /** Returns whether nodes share hosts in the given cloud: True for every cloud except "aws" */
    // TODO: Put this in zone config instead?
    private boolean allowsHostSharing(CloudName cloudName) {
        return ! cloudName.value().equals("aws");
    }

    /** Returns the cost of the resources the host resources calculator reports for this flavor */
    private double costOf(Flavor flavor) {
        NodeResources chargedResources = hostResourcesCalculator.availableCapacityOf(flavor.name(), flavor.resources());
        return costOf(chargedResources);
    }

    /**
     * Returns the cost of the given resources: The sum of vcpu, memory and disk,
     * each weighted by its unit cost. Only the ratios between costs are meaningful.
     */
    static double costOf(NodeResources resources) {
        return resources.vcpu() * cpuUnitCost +
               resources.memoryGb() * memoryUnitCost +
               resources.diskGb() * diskUnitCost;
    }

}