path: root/clustercontroller-core/src/main/java/com/yahoo/vespa/clustercontroller/core/MetricUpdater.java
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;

import com.yahoo.vdslib.state.ClusterState;
import com.yahoo.vdslib.state.Node;
import com.yahoo.vdslib.state.NodeState;
import com.yahoo.vdslib.state.NodeType;
import com.yahoo.vdslib.state.State;
import com.yahoo.vespa.clustercontroller.utils.util.ComponentMetricReporter;
import com.yahoo.vespa.clustercontroller.utils.util.MetricReporter;

import java.time.Duration;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.BooleanSupplier;

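/**
 * Publishes cluster controller metrics through a {@link ComponentMetricReporter} scoped to
 * "cluster-controller.", with the controller index and cluster id added as dimensions.
 */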
public class MetricUpdater {

    private final ComponentMetricReporter metricReporter;

    public MetricUpdater(MetricReporter metricReporter, int controllerIndex, String clusterName) {
        this.metricReporter = new ComponentMetricReporter(metricReporter, "cluster-controller.");
        this.metricReporter.addDimension("controller-index", String.valueOf(controllerIndex));
        this.metricReporter.addDimension("clusterid", clusterName);
    }

    public MetricReporter.Context createContext(Map<String, String> dimensions) {
        return metricReporter.createContext(dimensions);
    }

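    // Counts the nodes considered available; used for the available-nodes.ratio metric below.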
    private static int nodesInAvailableState(Map<State, Integer> nodeCounts) {
        return nodeCounts.getOrDefault(State.INITIALIZING, 0)
                + nodeCounts.getOrDefault(State.RETIRED, 0)
                + nodeCounts.getOrDefault(State.UP, 0)
                // Even though technically not true, we here treat Maintenance as an available state to
                // avoid triggering false alerts when a node is taken down transiently in an orchestrated manner.
                + nodeCounts.getOrDefault(State.MAINTENANCE, 0);
    }

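    /**
     * Reports, per node type, how many configured nodes are in each state ("up.count", "down.count", ...)
     * and the ratio of available nodes, then increments "cluster-state-change" and updates the
     * resource usage gauges for the cluster as a whole.
     */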
    public void updateClusterStateMetrics(ContentCluster cluster, ClusterState state, ResourceUsageStats resourceUsage) {
        Map<String, String> dimensions = new HashMap<>();
        dimensions.put("cluster", cluster.getName());
        dimensions.put("clusterid", cluster.getName());
        for (NodeType type : NodeType.getTypes()) {
            dimensions.put("node-type", type.toString().toLowerCase());
            MetricReporter.Context context = createContext(dimensions);
            Map<State, Integer> nodeCounts = new HashMap<>();
            for (State s : State.values()) {
                nodeCounts.put(s, 0);
            }
            for (Integer i : cluster.getConfiguredNodes().keySet()) {
                NodeState s = state.getNodeState(new Node(type, i));
                Integer count = nodeCounts.get(s.getState());
                nodeCounts.put(s.getState(), count + 1);
            }
            for (State s : State.values()) {
                String name = s.toString().toLowerCase() + ".count";
                metricReporter.set(name, nodeCounts.get(s), context);
            }

            final int availableNodes = nodesInAvailableState(nodeCounts);
            final int totalNodes = Math.max(cluster.getConfiguredNodes().size(), 1); // Assumes 1-1 between distributor and storage
            metricReporter.set("available-nodes.ratio", (double)availableNodes / totalNodes, context);
        }
        dimensions.remove("node-type");
        MetricReporter.Context context = createContext(dimensions);
        metricReporter.add("cluster-state-change", 1, context);

        metricReporter.set("resource_usage.max_disk_utilization", resourceUsage.getMaxDiskUtilization(), context);
        metricReporter.set("resource_usage.max_memory_utilization", resourceUsage.getMaxMemoryUtilization(), context);
        metricReporter.set("resource_usage.nodes_above_limit", resourceUsage.getNodesAboveLimit(), context);
        metricReporter.set("resource_usage.disk_limit", resourceUsage.getDiskLimit(), context);
        metricReporter.set("resource_usage.memory_limit", resourceUsage.getMemoryLimit(), context);
    }

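    /**
     * Reports "agreed-master-votes": how many of the received votes agree on the most popular
     * candidate. The values of {@code data} are the cast votes; the keys are not used here.
     */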
    public void updateMasterElectionMetrics(Map<Integer, Integer> data) {
        Map<Integer, Integer> voteCounts = new HashMap<>();
        for (Integer i : data.values()) {
            int count = voteCounts.getOrDefault(i, 0);
            voteCounts.put(i, count + 1);
        }
        SortedSet<Integer> counts = new TreeSet<>(voteCounts.values());
        if (counts.size() > 1 && counts.first() > counts.last()) {
            throw new IllegalStateException("Assumed smallest count is sorted first");
        }
        int maxCount = counts.isEmpty() ? 0 : counts.last();
        metricReporter.set("agreed-master-votes", maxCount);
    }

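    // Gauge that is 1 while this controller is the elected master and 0 otherwise.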
    public void updateMasterState(boolean isMaster) {
        metricReporter.set("is-master", isMaster ? 1 : 0);
    }

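    // Records the duration of the latest tick, split into busy and idle tick time depending on whether it did work.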
    public void addTickTime(long millis, boolean didWork) {
        if (didWork) {
            metricReporter.set("busy-tick-time-ms", millis);
        } else {
            metricReporter.set("idle-tick-time-ms", millis);
        }
    }

    public void recordNewNodeEvent() {
        // TODO(hakonhall): Replace add() with a persistent aggregate metric.
        metricReporter.add("node-event", 1);
    }

    public void updateRemoteTaskQueueSize(int size) {
        metricReporter.set("remote-task-queue.size", size);
    }

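    /**
     * Runs the given unit of work and reports its wall-clock duration as "work-ms", tagged with the
     * given work id and whether any work was done. Note that the value reported is in seconds, at
     * millisecond resolution. Returns the result of the supplier.
     */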
    public boolean forWork(String workId, BooleanSupplier work) {
        long startNanos = System.nanoTime();
        boolean didWork = work.getAsBoolean();
        double seconds = Duration.ofNanos(System.nanoTime() - startNanos).toMillis() / 1000.;

        MetricReporter.Context context = createContext(Map.of("didWork", Boolean.toString(didWork),
                                                              "workId", workId));
        metricReporter.set("work-ms", seconds, context);

        return didWork;
    }
}