1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.clustercontroller.core;
import com.yahoo.vdslib.state.ClusterState;
import com.yahoo.vdslib.state.Node;
import com.yahoo.vdslib.state.NodeState;
import com.yahoo.vdslib.state.NodeType;
import com.yahoo.vdslib.state.State;
import com.yahoo.vespa.clustercontroller.utils.util.ComponentMetricReporter;
import com.yahoo.vespa.clustercontroller.utils.util.MetricReporter;
import java.time.Duration;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.BooleanSupplier;
/**
 * Publishes cluster controller metrics through a {@link ComponentMetricReporter}.
 *
 * <p>The wrapped reporter is created with the {@code "cluster-controller."} component string and is
 * given the {@code controller-index} and {@code clusterid} dimensions at construction time.
 */
public class MetricUpdater {

    private final ComponentMetricReporter metricReporter;

    /**
     * Creates an updater that reports through the given reporter.
     *
     * @param metricReporter  the underlying reporter to wrap
     * @param controllerIndex value of the {@code controller-index} dimension
     * @param clusterName     value of the {@code clusterid} dimension
     */
    public MetricUpdater(MetricReporter metricReporter, int controllerIndex, String clusterName) {
        this.metricReporter = new ComponentMetricReporter(metricReporter, "cluster-controller.");
        this.metricReporter.addDimension("controller-index", String.valueOf(controllerIndex));
        this.metricReporter.addDimension("clusterid", clusterName);
    }

    /**
     * Creates a metric context for the given dimensions by delegating to the wrapped reporter.
     *
     * @param dimensions dimension name to dimension value
     * @return the context created by the wrapped reporter
     */
    public MetricReporter.Context createContext(Map<String, String> dimensions) {
        return metricReporter.createContext(dimensions);
    }

    /**
     * Sums the node counts of the states that are treated as "available".
     *
     * @param nodeCounts number of nodes per state; states missing from the map count as zero
     * @return the number of nodes in Initializing, Retired, Up or Maintenance
     */
    private static int nodesInAvailableState(Map<State, Integer> nodeCounts) {
        return nodeCounts.getOrDefault(State.INITIALIZING, 0)
                + nodeCounts.getOrDefault(State.RETIRED, 0)
                + nodeCounts.getOrDefault(State.UP, 0)
                // Even though technically not true, we here treat Maintenance as an available state to
                // avoid triggering false alerts when a node is taken down transiently in an orchestrated manner.
                + nodeCounts.getOrDefault(State.MAINTENANCE, 0);
    }

    /**
     * Reports per-node-type state counts and availability ratio for the given cluster state, then
     * counts one cluster state change and reports the resource usage figures.
     *
     * @param cluster       the cluster whose configured nodes are looked up in {@code state}
     * @param state         the cluster state the node states are read from
     * @param resourceUsage source of the {@code resource_usage.*} metric values
     */
    public void updateClusterStateMetrics(ContentCluster cluster, ClusterState state, ResourceUsageStats resourceUsage) {
        Map<String, String> dimensions = new HashMap<>();
        dimensions.put("cluster", cluster.getName());
        dimensions.put("clusterid", cluster.getName());

        for (NodeType type : NodeType.getTypes()) {
            // Locale.ROOT keeps dimension values and metric names stable regardless of the JVM's
            // default locale (e.g. Turkish would map 'I' to a dotless 'ı').
            dimensions.put("node-type", type.toString().toLowerCase(Locale.ROOT));
            MetricReporter.Context context = createContext(dimensions);

            // Pre-populate every state so that a count (possibly zero) is reported for each of them.
            Map<State, Integer> nodeCounts = new HashMap<>();
            for (State s : State.values()) {
                nodeCounts.put(s, 0);
            }
            for (Integer index : cluster.getConfiguredNodes().keySet()) {
                NodeState nodeState = state.getNodeState(new Node(type, index));
                nodeCounts.merge(nodeState.getState(), 1, Integer::sum);
            }
            for (State s : State.values()) {
                String name = s.toString().toLowerCase(Locale.ROOT) + ".count";
                metricReporter.set(name, nodeCounts.get(s), context);
            }

            final int availableNodes = nodesInAvailableState(nodeCounts);
            // Clamped to at least 1 to avoid dividing by zero when no nodes are configured.
            final int totalNodes = Math.max(cluster.getConfiguredNodes().size(), 1); // Assumes 1-1 between distributor and storage
            metricReporter.set("available-nodes.ratio", (double) availableNodes / totalNodes, context);
        }

        // The remaining metrics are cluster-wide and must not carry the last node type seen above.
        dimensions.remove("node-type");
        MetricReporter.Context context = createContext(dimensions);
        metricReporter.add("cluster-state-change", 1, context);

        metricReporter.set("resource_usage.max_disk_utilization", resourceUsage.getMaxDiskUtilization(), context);
        metricReporter.set("resource_usage.max_memory_utilization", resourceUsage.getMaxMemoryUtilization(), context);
        metricReporter.set("resource_usage.nodes_above_limit", resourceUsage.getNodesAboveLimit(), context);
        metricReporter.set("resource_usage.disk_limit", resourceUsage.getDiskLimit(), context);
        metricReporter.set("resource_usage.memory_limit", resourceUsage.getMemoryLimit(), context);
    }

    /**
     * Reports, as {@code agreed-master-votes}, how many entries of {@code data} share the most
     * common value; reports 0 when {@code data} is empty.
     *
     * @param data the votes to tally; only the map's values are inspected
     *             (presumably voter index to voted-for index — verify against caller)
     */
    public void updateMasterElectionMetrics(Map<Integer, Integer> data) {
        Map<Integer, Integer> voteCounts = new HashMap<>();
        for (Integer votedFor : data.values()) {
            voteCounts.merge(votedFor, 1, Integer::sum);
        }
        int maxCount = voteCounts.values().stream().mapToInt(Integer::intValue).max().orElse(0);
        metricReporter.set("agreed-master-votes", maxCount);
    }

    /**
     * Reports {@code is-master} as 1 when {@code isMaster} is true and 0 otherwise.
     *
     * @param isMaster the flag to report
     */
    public void updateMasterState(boolean isMaster) {
        metricReporter.set("is-master", isMaster ? 1 : 0);
    }

    /**
     * Reports the duration of a tick under {@code busy-tick-time-ms} or {@code idle-tick-time-ms}.
     *
     * @param millis  the value to report
     * @param didWork selects the busy metric when true, the idle metric when false
     */
    public void addTickTime(long millis, boolean didWork) {
        metricReporter.set(didWork ? "busy-tick-time-ms" : "idle-tick-time-ms", millis);
    }

    /** Adds 1 to the {@code node-event} metric. */
    public void recordNewNodeEvent() {
        // TODO(hakonhall): Replace add() with a persistent aggregate metric.
        metricReporter.add("node-event", 1);
    }

    /**
     * Reports {@code remote-task-queue.size}.
     *
     * @param size the value to report
     */
    public void updateRemoteTaskQueueSize(int size) {
        metricReporter.set("remote-task-queue.size", size);
    }

    /**
     * Runs {@code work}, times it, and reports the elapsed time as {@code work-ms} with the
     * {@code didWork} and {@code workId} dimensions.
     *
     * @param workId value of the {@code workId} dimension
     * @param work   the work to execute; its result is used as the {@code didWork} dimension
     * @return the value returned by {@code work}
     */
    public boolean forWork(String workId, BooleanSupplier work) {
        long startNanos = System.nanoTime();
        boolean didWork = work.getAsBoolean();
        // NOTE(review): the reported value is in seconds (truncated to whole milliseconds first)
        // even though the metric is named "work-ms" — confirm which unit consumers expect before
        // changing either the value or the name.
        double seconds = Duration.ofNanos(System.nanoTime() - startNanos).toMillis() / 1000.;

        MetricReporter.Context context = createContext(Map.of("didWork", Boolean.toString(didWork),
                                                              "workId", workId));
        metricReporter.set("work-ms", seconds, context);

        return didWork;
    }

}
|