aboutsummaryrefslogtreecommitdiffstats
path: root/node-repository/src/main/java/com/yahoo/vespa/hosted/provision/autoscale/ResourceIterator.java
blob: bc14ca1779c5b5fd5523a223005d52ebeda36489 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.autoscale;

import com.yahoo.config.provision.ClusterResources;
import com.yahoo.config.provision.NodeResources;
import com.yahoo.vespa.hosted.provision.applications.Cluster;

/**
 * Generates the sequence of possible cluster resource allocations to consider
 * for a cluster, given an observed total load and the currently active
 * nodes/groups allocation.
 */
public class ResourceIterator {

    // Node count bounds applied when producing suggestions for applications
    // which have not enabled autoscaling (min resources == max resources)
    private static final int minimumNodes = 3; // With redundancy this can never go below 2
    private static final int maximumNodes = 150;

    // Serving a query incurs a fixed cost per node plus a cost proportional to the document
    // count on the node, so configurations with differing node counts are not directly comparable
    // unless this is accounted for.
    // TODO: Measure this, and only take it into account with queries
    private static final double fixedCpuCostFraction = 0.1;

    // Prescribed state
    private final Cluster cluster;

    // Observed state
    private final AllocatableClusterResources allocation;
    private final double cpuLoad;
    private final double memoryLoad;
    private final double diskLoad;
    private final int groupSize;

    // Derived from the observed state
    private final int nodeIncrement;
    private final boolean singleGroupMode;

    // Iterator state: the node count the next call to next() will produce resources for
    private int currentNodes;

    public ResourceIterator(double cpuLoad, double memoryLoad, double diskLoad,
                            AllocatableClusterResources currentAllocation,
                            Cluster cluster) {
        this.cpuLoad = cpuLoad;
        this.memoryLoad = memoryLoad;
        this.diskLoad = diskLoad;
        this.allocation = currentAllocation;
        this.cluster = cluster;

        // ceil: If the division does not produce a whole number we assume some node is missing
        this.groupSize = (int)Math.ceil((double)currentAllocation.nodes() / currentAllocation.groups());

        // The unit by which it is effective to grow or shrink this cluster:
        // We take the group size as given (assumed decided by someone wiser than us) and vary the
        // number of groups — except when there is just one group, where single nodes can be
        // added to or removed from it.
        this.singleGroupMode = currentAllocation.groups() == 1;
        this.nodeIncrement = singleGroupMode ? 1 : groupSize;

        // Walk down from the current node count to the lowest permissible starting point
        int nodes = currentAllocation.nodes();
        while (nodes - nodeIncrement >= minNodes()
               && ( singleGroupMode || nodes - nodeIncrement > groupSize)) // group level redundancy
            nodes -= nodeIncrement;
        this.currentNodes = nodes;
    }

    /** Returns the next candidate allocation and advances the iterator. */
    public ClusterResources next() {
        ClusterResources candidate = resourcesWith(currentNodes);
        currentNodes += nodeIncrement;
        return candidate;
    }

    /** Returns whether there are more candidate allocations within the allowed node range. */
    public boolean hasNext() {
        return currentNodes <= maxNodes();
    }

    /**
     * Returns whether we are producing suggestions rather than autoscaling:
     * When min and max resources are equal, autoscaling is effectively not enabled,
     * and we instead explore a wide fixed range to suggest from.
     */
    private boolean suggestMode() {
        return cluster.minResources().equals(cluster.maxResources());
    }

    /** The lowest node count this iterator may produce. */
    private int minNodes() {
        if (suggestMode()) return minimumNodes;
        if (singleGroupMode) return cluster.minResources().nodes();
        return Math.max(cluster.minResources().nodes(), cluster.minResources().groups() * groupSize );
    }

    /** The highest node count this iterator may produce. */
    private int maxNodes() {
        if (suggestMode()) return maximumNodes;
        if (singleGroupMode) return cluster.maxResources().nodes();
        return Math.min(cluster.maxResources().nodes(), cluster.maxResources().groups() * groupSize );
    }

    private ClusterResources resourcesWith(int nodes) {
        // Size each node as if the redundant capacity (one node, or one whole group) were absent
        int nodesWithRedundancy = nodes - (singleGroupMode ? 1 : groupSize);
        int groups = singleGroupMode ? 1 : nodes / groupSize;
        return new ClusterResources(nodes, groups, nodeResourcesWith(nodesWithRedundancy));
    }

    /**
     * For the observed load this instance is initialized with, returns the resources needed
     * per node to be at ideal load given a target node count.
     */
    private NodeResources nodeResourcesWith(int nodeCount) {
        double cpu = cpuPerNodeAt(nodeCount);

        // Memory and disk scale with group size; on content clusters the load additionally
        // scales with each node's share of the content
        double memory, disk;
        if (allocation.clusterType().isContent()) {
            int contentShare = singleGroupMode ? nodeCount : groupSize;
            memory = groupUsage(Resource.memory, memoryLoad) / contentShare / Resource.memory.idealAverageLoad();
            disk = groupUsage(Resource.disk, diskLoad) / contentShare / Resource.disk.idealAverageLoad();
        }
        else {
            memory = nodeUsage(Resource.memory, memoryLoad) / Resource.memory.idealAverageLoad();
            disk = nodeUsage(Resource.disk, diskLoad) / Resource.disk.idealAverageLoad();
        }

        return allocation.realResources().withVcpu(cpu).withMemoryGb(memory).withDiskGb(disk);
    }

    /**
     * Returns the cpu needed per node at the given node count.
     * Cpu scales with cluster size (TODO: Only reads, writes scales with group size).
     */
    private double cpuPerNodeAt(int nodeCount) {
        double totalCpu = clusterUsage(Resource.cpu, cpuLoad);
        if ( ! singleGroupMode)
            return totalCpu / nodeCount / Resource.cpu.idealAverageLoad();

        // With a single group, the fixed cost portion of cpu does not scale with the node count
        // TODO: Only for the portion of cpu consumed by queries
        return fixedCpuCostFraction       * totalCpu / groupSize / Resource.cpu.idealAverageLoad() +
               (1 - fixedCpuCostFraction) * totalCpu / nodeCount / Resource.cpu.idealAverageLoad();
    }

    /** The total usage of the given resource across the whole cluster, at the observed load. */
    private double clusterUsage(Resource resource, double load) {
        return nodeUsage(resource, load) * allocation.nodes();
    }

    /** The total usage of the given resource across one group, at the observed load. */
    private double groupUsage(Resource resource, double load) {
        return nodeUsage(resource, load) * groupSize;
    }

    /** The usage of the given resource on a single node, at the observed load. */
    private double nodeUsage(Resource resource, double load) {
        return load * resource.valueFrom(allocation.realResources());
    }

}