aboutsummaryrefslogtreecommitdiffstats
path: root/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/ResourceExhaustionCalculator.java
blob: 4bc6cd1fbd278ed96572310a7b3664d317921eda (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;

import com.yahoo.vespa.clustercontroller.core.hostinfo.HostInfo;

import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Utility for enumerating the resources that a given node (or set of nodes)
 * have exhausted, based on a mapping from (opaque) resource names to their
 * exclusive limits.
 *
 * Hysteresis is supported by optionally supplying the _current_ feed block
 * state. For a (node, resource) pair that is part of that state, the limit
 * is lowered by the noise level, which means an exhaustion may still be
 * emitted for a node that is technically below the feed block limit, as long
 * as it is not yet below the hysteresis threshold.
 */
public class ResourceExhaustionCalculator {

    private final boolean feedBlockEnabled;
    private final Map<String, Double> feedBlockLimits;
    private final double feedBlockNoiseLevel;
    private final Set<NodeAndResourceType> previouslyBlockedNodeResources;

    /** Key identifying one resource type on one node; equality covers both members. */
    private static class NodeAndResourceType {
        public final int nodeIndex;
        public final String resourceType;

        public NodeAndResourceType(int nodeIndex, String resourceType) {
            this.nodeIndex = nodeIndex;
            this.resourceType = resourceType;
        }

        public static NodeAndResourceType of(int nodeIndex, String resourceType) {
            return new NodeAndResourceType(nodeIndex, resourceType);
        }

        @Override
        public boolean equals(Object o) {
            if (o == this) {
                return true;
            }
            if (o == null || o.getClass() != getClass()) {
                return false;
            }
            var other = (NodeAndResourceType) o;
            return other.nodeIndex == nodeIndex
                    && Objects.equals(other.resourceType, resourceType);
        }

        @Override
        public int hashCode() {
            return Objects.hash(nodeIndex, resourceType);
        }
    }

    /**
     * Creates a calculator without hysteresis: no previous feed block state
     * is taken into account and the noise level is zero.
     */
    public ResourceExhaustionCalculator(boolean feedBlockEnabled, Map<String, Double> feedBlockLimits) {
        this(feedBlockEnabled, feedBlockLimits, null, 0.0);
    }

    /**
     * Creates a calculator that applies hysteresis for the (node, resource) pairs
     * found in the concrete exhaustions of the given previous feed block.
     *
     * @param feedBlockEnabled    whether a feed block may be inferred at all
     * @param feedBlockLimits     resource name to limit; resources not present use a limit of 1.0
     * @param previousFeedBlock   the current feed block state, or null if there is none
     * @param feedBlockNoiseLevel amount subtracted from the limit of previously blocked pairs
     */
    public ResourceExhaustionCalculator(boolean feedBlockEnabled, Map<String, Double> feedBlockLimits,
                                        ClusterStateBundle.FeedBlock previousFeedBlock,
                                        double feedBlockNoiseLevel) {
        this.feedBlockEnabled = feedBlockEnabled;
        this.feedBlockLimits = feedBlockLimits;
        this.feedBlockNoiseLevel = feedBlockNoiseLevel;
        this.previouslyBlockedNodeResources = (previousFeedBlock == null)
                ? Collections.emptySet()
                : previousFeedBlock.getConcreteExhaustions().stream()
                        .map(ex -> NodeAndResourceType.of(ex.node.getIndex(), ex.resourceType))
                        .collect(Collectors.toSet());
    }

    /**
     * Prefixes the message with the content cluster name (for disambiguation) and
     * appends a user-friendly documentation link.
     */
    public static String decoratedMessage(ContentCluster cluster, String msg) {
        String clusterName = cluster.getName();
        return "in content cluster '%s': %s. See https://docs.vespa.ai/en/operations/feed-block.html".formatted(clusterName, msg);
    }

    /**
     * Returns a feed block describing the exhaustions found across the cluster's nodes,
     * or null if feed block is disabled or no exhaustions were found. At most the first
     * three exhaustions are spelled out in the description; the rest are only counted.
     */
    public ClusterStateBundle.FeedBlock inferContentClusterFeedBlockOrNull(ContentCluster cluster) {
        if (!feedBlockEnabled) {
            return null;
        }
        var exhaustions = enumerateNodeResourceExhaustionsAcrossAllNodes(cluster.getNodeInfos());
        if (exhaustions.isEmpty()) {
            return null;
        }
        final int maxDescriptions = 3;
        String summary = exhaustions.stream()
                .limit(maxDescriptions)
                .map(NodeResourceExhaustion::toExhaustionAddedDescription)
                .collect(Collectors.joining(", "));
        int omitted = exhaustions.size() - maxDescriptions;
        if (omitted > 0) {
            summary = summary + String.format(" (... and %d more)", omitted);
        }
        // FIXME a cluster state recomputation is currently triggered even when the number of
        // exhaustions exceeds what is included in the description. Though at that point,
        // cluster state recomputations will be the least of your worries...!
        return ClusterStateBundle.FeedBlock.blockedWith(decoratedMessage(cluster, summary), exhaustions);
    }

    /**
     * Returns one exhaustion per resource in the host info whose usage is strictly above
     * its effective limit for the given node, or an empty set if there are none.
     */
    public Set<NodeResourceExhaustion> resourceExhaustionsFromHostInfo(NodeInfo nodeInfo, HostInfo hostInfo) {
        // Allocated on first hit only, since there are usually no exhaustions to report.
        Set<NodeResourceExhaustion> exhausted = null;
        for (var entry : hostInfo.getContentNode().getResourceUsage().entrySet()) {
            String resourceType = entry.getKey();
            double limit = effectiveLimitFor(nodeInfo, resourceType);
            if (entry.getValue().getUsage() > limit) {
                if (exhausted == null) {
                    exhausted = new LinkedHashSet<>();
                }
                exhausted.add(new NodeResourceExhaustion(nodeInfo.getNode(), resourceType, entry.getValue(),
                                                         limit, nodeInfo.getRpcAddress()));
            }
        }
        return (exhausted == null) ? Collections.emptySet() : exhausted;
    }

    // Limit to compare usage against. To enable hysteresis on feed un-block, the configured
    // limit is lowered by the noise level (never below zero) iff this particular
    // <node, resource> tuple was blocked in the previous state.
    private double effectiveLimitFor(NodeInfo nodeInfo, String resourceType) {
        double configuredLimit = feedBlockLimits.getOrDefault(resourceType, 1.0);
        var key = NodeAndResourceType.of(nodeInfo.getNodeIndex(), resourceType);
        if (!previouslyBlockedNodeResources.contains(key)) {
            return configuredLimit;
        }
        return Math.max(configuredLimit - feedBlockNoiseLevel, 0.0);
    }

    /** Returns the exhaustions of the given node; always empty for nodes that are not storage nodes. */
    public Set<NodeResourceExhaustion> enumerateNodeResourceExhaustions(NodeInfo nodeInfo) {
        return nodeInfo.isStorage()
                ? resourceExhaustionsFromHostInfo(nodeInfo, nodeInfo.getHostInfo())
                : Collections.emptySet();
    }

    // A node is only considered when its wanted state is one of "ur" and its
    // reported state is one of "ui". The reported state is not consulted when
    // the wanted state already rules the node out.
    private static boolean nodeMayContributeToFeedBlocked(NodeInfo info) {
        if (!info.getWantedState().getState().oneOf("ur")) {
            return false;
        }
        return info.getReportedState().getState().oneOf("ui");
    }

    // Returns 0-n entries per content node in the cluster, where n is the number of exhausted
    // resource types on any given node. Entries keep the iteration order of the given nodes.
    public Set<NodeResourceExhaustion> enumerateNodeResourceExhaustionsAcrossAllNodes(Collection<NodeInfo> nodeInfos) {
        Set<NodeResourceExhaustion> allExhaustions = new LinkedHashSet<>();
        for (NodeInfo info : nodeInfos) {
            if (nodeMayContributeToFeedBlocked(info)) {
                allExhaustions.addAll(enumerateNodeResourceExhaustions(info));
            }
        }
        return allExhaustions;
    }

}