summaryrefslogtreecommitdiffstats
path: root/configserver/src/main/java/com/yahoo/vespa/config/server/metrics/ClusterMetricsRetriever.java
blob: b51f3d01e1ad8bd93f965c2f4527a6d6c0ed0f1f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.config.server.metrics;

import com.yahoo.slime.ArrayTraverser;
import com.yahoo.slime.Inspector;
import com.yahoo.slime.Slime;
import com.yahoo.vespa.config.SlimeUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;

import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.net.URI;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;


/**
 * Client for reaching out to nodes in an application instance and get their
 * metrics.
 *
 * @author olaa
 * @author ogronnesby
 */
public class ClusterMetricsRetriever {

    private static final Logger log = Logger.getLogger(ClusterMetricsRetriever.class.getName());

    private static final String VESPA_CONTAINER = "vespa.container";
    private static final String VESPA_QRSERVER = "vespa.qrserver";
    private static final String VESPA_DISTRIBUTOR = "vespa.distributor";
    private static final List<String> WANTED_METRIC_SERVICES = List.of(VESPA_CONTAINER, VESPA_QRSERVER, VESPA_DISTRIBUTOR);

    /**
     * Call the metrics API on each host and aggregate the metrics
     * into a single value, grouped by cluster.
     */
    public Map<ClusterInfo, MetricsAggregator> requestMetricsGroupedByCluster(Collection<URI> hosts) {
        Map<ClusterInfo, MetricsAggregator> clusterMetricsMap = new ConcurrentHashMap<>();

        Runnable retrieveMetricsJob = () ->
                hosts.parallelStream().forEach(host ->
                    getHostMetrics(host, clusterMetricsMap)
                );

        ForkJoinPool threadPool = new ForkJoinPool(5);
        threadPool.submit(retrieveMetricsJob);
        threadPool.shutdown();

        try {
            threadPool.awaitTermination(1, TimeUnit.MINUTES);
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
        return clusterMetricsMap;
    }

    private static void getHostMetrics(URI hostURI, Map<ClusterInfo, MetricsAggregator> clusterMetricsMap) {
            Slime responseBody = doMetricsRequest(hostURI);
            var parseError = responseBody.get().field("error_message");

            if (parseError.valid()) {
                log.info("Failed to retrieve metrics from " + hostURI + ": " + parseError.asString());
            }

            Inspector services = responseBody.get().field("services");
            services.traverse((ArrayTraverser) (i, servicesInspector) ->
                parseService(servicesInspector, clusterMetricsMap)
            );
    }

    private static Slime doMetricsRequest(URI hostURI) {
        HttpGet get = new HttpGet(hostURI);
        try {
            HttpClient httpClient = HttpClientBuilder.create().build();
            HttpResponse response = httpClient.execute(get);
            InputStream is = response.getEntity().getContent();
            Slime slime = SlimeUtils.jsonToSlime(is.readAllBytes());
            is.close();
            return slime;
        } catch (IOException e) {
            // Usually caused by applications being deleted during metric retrieval
            log.warning("Was unable to fetch metrics from " + hostURI);
            return new Slime();
        }
    }

    private static void parseService(Inspector service, Map<ClusterInfo, MetricsAggregator> clusterMetricsMap) {
        String serviceName = service.field("name").asString();
        service.field("metrics").traverse((ArrayTraverser) (i, metric) ->
                addMetricsToAggeregator(serviceName, metric, clusterMetricsMap)
        );
    }

    private static void addMetricsToAggeregator(String serviceName, Inspector metric, Map<ClusterInfo, MetricsAggregator> clusterMetricsMap) {
        if (!WANTED_METRIC_SERVICES.contains(serviceName)) return;
        Inspector values = metric.field("values");
        ClusterInfo clusterInfo = getClusterInfoFromDimensions(metric.field("dimensions"));
        MetricsAggregator metricsAggregator = clusterMetricsMap.computeIfAbsent(clusterInfo, c -> new MetricsAggregator());

        switch (serviceName) {
            case "vespa.container":
                metricsAggregator.addContainerLatency(
                        values.field("query_latency.sum").asDouble(),
                        values.field("query_latency.count").asDouble());
                metricsAggregator.addFeedLatency(
                        values.field("feed_latency.sum").asDouble(),
                        values.field("feed_latency.count").asDouble());
                break;
            case "vespa.qrserver":
                metricsAggregator.addQrLatency(
                        values.field("query_latency.sum").asDouble(),
                        values.field("query_latency.count").asDouble());
                break;
            case "vespa.distributor":
                metricsAggregator.addDocumentCount(values.field("vds.distributor.docsstored.average").asDouble());
                break;
        }
    }

    private static ClusterInfo getClusterInfoFromDimensions(Inspector dimensions) {
        return new ClusterInfo(dimensions.field("clusterid").asString(), dimensions.field("clustertype").asString());
    }
}