aboutsummaryrefslogtreecommitdiffstats
path: root/node-admin/src/main/java/com/yahoo/vespa/hosted/node/admin/container/ContainerStatsCollector.java
blob: 0e16e2cabf619926d60e33573d9958dfdd63e554 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.node.admin.container;

import com.yahoo.vespa.hosted.node.admin.cgroup.Cgroup;
import com.yahoo.vespa.hosted.node.admin.cgroup.CpuController;
import com.yahoo.vespa.hosted.node.admin.cgroup.Size;
import com.yahoo.vespa.hosted.node.admin.cgroup.MemoryController;
import com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentContext;
import com.yahoo.vespa.hosted.node.admin.task.util.file.UnixUser;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.FileSystem;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.time.Duration;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Stream;

/**
 * Collects CPU, GPU, memory and network statistics for a container.
 *
 * CPU and memory figures are read through the container's cgroup controllers, network counters from
 * {@code /proc/<pid>/net/dev}, host CPU time from {@code /proc/stat}, and GPU figures by running
 * {@code nvidia-smi} through the container engine.
 *
 * Uses same approach as runc: https://github.com/opencontainers/runc/tree/master/libcontainer/cgroups/fs
 *
 * @author mpolden
 */
class ContainerStatsCollector {

    /** Device node whose presence is used to decide whether GPU statistics should be collected */
    private static final String NVIDIA_DEVICE_PATH = "/dev/nvidia0";

    /** Factor applied to the memory columns printed by nvidia-smi to obtain bytes (2^20) */
    private static final long BYTES_PER_MEBIBYTE = 1L << 20;

    /** Minimum number of whitespace-separated fields required on a /proc/net/dev interface line */
    private static final int NET_DEV_MIN_FIELDS = 17;

    private final ContainerEngine containerEngine;
    private final FileSystem fileSystem;
    private final Cgroup rootCgroup;
    private final int onlineCpus;

    /** Creates a collector which reports the number of processors available to this JVM as online CPUs */
    ContainerStatsCollector(ContainerEngine containerEngine, Cgroup rootCgroup, FileSystem fileSystem) {
        this(containerEngine, rootCgroup, fileSystem, Runtime.getRuntime().availableProcessors());
    }

    /**
     * Creates a collector with an explicit online CPU count.
     *
     * @param containerEngine engine used to execute nvidia-smi, must be non-null
     * @param rootCgroup      cgroup from which the container's cgroup is resolved, must be non-null
     * @param fileSystem      file system used to resolve /proc and /dev paths, must be non-null
     * @param onlineCpus      CPU count reported in the collected CPU statistics
     */
    ContainerStatsCollector(ContainerEngine containerEngine, Cgroup rootCgroup, FileSystem fileSystem, int onlineCpus) {
        this.containerEngine = Objects.requireNonNull(containerEngine);
        this.fileSystem = Objects.requireNonNull(fileSystem);
        this.rootCgroup = Objects.requireNonNull(rootCgroup);
        this.onlineCpus = onlineCpus;
    }

    /**
     * Collect statistics for given container ID and PID
     *
     * @param context     node agent context passed to the container engine when running nvidia-smi
     * @param containerId container whose cgroup is read for CPU and memory statistics
     * @param pid         PID whose {@code /proc/<pid>/net/dev} is read for network statistics
     * @param iface       name of the network interface to report; also used as key in the network statistics
     * @return the collected statistics, or empty if a file that was read did not exist
     * @throws UncheckedIOException     if reading fails for any reason other than a missing file
     * @throws IllegalArgumentException if no statistics are found for {@code iface}, or GPU output cannot be parsed
     */
    public Optional<ContainerStats> collect(NodeAgentContext context, ContainerId containerId, int pid, String iface) {
        try {
            ContainerStats.CpuStats cpuStats = collectCpuStats(containerId);
            ContainerStats.MemoryStats memoryStats = collectMemoryStats(containerId);
            Map<String, ContainerStats.NetworkStats> networkStats = Map.of(iface, collectNetworkStats(iface, pid));
            List<ContainerStats.GpuStats> gpuStats = collectGpuStats(context);
            return Optional.of(new ContainerStats(networkStats, memoryStats, cpuStats, gpuStats));
        } catch (NoSuchFileException ignored) {
            return Optional.empty(); // Container disappeared while we collected stats
        } catch (UncheckedIOException e) {
            // Same situation as above, but the missing file was reported through an unchecked wrapper.
            // instanceof is false for a null cause, so no separate null check is needed
            if (e.getCause() instanceof NoSuchFileException) return Optional.empty();
            throw e;
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Returns statistics for each GPU reported by nvidia-smi, or an empty list if the NVIDIA device node
     * does not exist.
     */
    private List<ContainerStats.GpuStats> collectGpuStats(NodeAgentContext context) {
        if (!Files.exists(fileSystem.getPath(NVIDIA_DEVICE_PATH))) return List.of();

        // Closing is a no-op for in-memory streams, but releases the underlying resource should the
        // output stream ever be backed by one
        try (Stream<String> lines = containerEngine.execute(context, UnixUser.ROOT, Duration.ofSeconds(5),
                                                            "nvidia-smi",
                                                            "--query-gpu=index,utilization.gpu,memory.total,memory.free",
                                                            "--format=csv,noheader,nounits")
                                                   .getOutputLinesStream()) {
            return lines.map(ContainerStatsCollector::parseGpuStats).toList();
        }
    }

    /**
     * Parses one CSV line of nvidia-smi output with the columns index, utilization, total memory and free memory.
     *
     * @throws IllegalArgumentException if the line has fewer than 4 columns or a column is not a number
     */
    private static ContainerStats.GpuStats parseGpuStats(String s) {
        String[] columns = fields(s, ",\\s*");
        if (columns.length < 4) throw new IllegalArgumentException("Could not parse GPU stats from '" + s + "'");
        int deviceNumber = Integer.parseInt(columns[0]);
        int loadPercentage = Integer.parseInt(columns[1]);
        long memoryTotalBytes = Long.parseLong(columns[2]) * BYTES_PER_MEBIBYTE;
        long memoryFreeBytes = Long.parseLong(columns[3]) * BYTES_PER_MEBIBYTE;
        // Used memory is not queried, so derive it from total and free
        return new ContainerStats.GpuStats(deviceNumber, loadPercentage, memoryTotalBytes, memoryTotalBytes - memoryFreeBytes);
    }

    /** Reads CPU statistics from the CPU controller of the container's cgroup, together with host CPU usage */
    private ContainerStats.CpuStats collectCpuStats(ContainerId containerId) throws IOException {
        Map<CpuController.StatField, Long> cpuStats = rootCgroup.resolveContainer(containerId).cpu().readStats();
        return new ContainerStats.CpuStats(onlineCpus,
                                           systemCpuUsage(),
                                           cpuStats.get(CpuController.StatField.TOTAL_USAGE_USEC),
                                           cpuStats.get(CpuController.StatField.SYSTEM_USAGE_USEC),
                                           cpuStats.get(CpuController.StatField.THROTTLED_TIME_USEC),
                                           cpuStats.get(CpuController.StatField.TOTAL_PERIODS),
                                           cpuStats.get(CpuController.StatField.THROTTLED_PERIODS));
    }

    /** Reads memory statistics from the memory controller of the container's cgroup */
    private ContainerStats.MemoryStats collectMemoryStats(ContainerId containerId) throws IOException {
        MemoryController memoryController = rootCgroup.resolveContainer(containerId).memory();
        Size max = memoryController.readMax();
        long memoryUsageInBytes = memoryController.readCurrent().value();
        var stats = memoryController.readStat();
        long memoryLimitInBytes = max.isMax() ? -1 : max.value(); // -1 is reported when the limit is "max"
        return new ContainerStats.MemoryStats(
                stats.file().value(), memoryUsageInBytes, memoryLimitInBytes,
                stats.sock().value(), stats.slab().value(), stats.slabReclaimable().value(), stats.anon().value());
    }

    /**
     * Reads receive and transmit counters for the given interface from {@code /proc/<containerPid>/net/dev}.
     *
     * @throws IllegalArgumentException if no line with enough fields exists for the interface
     */
    private ContainerStats.NetworkStats collectNetworkStats(String iface, int containerPid) throws IOException {
        String label = iface + ":"; // The first field of an interface line is its name followed by a colon
        for (String line : Files.readAllLines(netDevPath(containerPid))) {
            String[] fields = fields(line); // fields() trims the line before splitting
            if (fields.length < NET_DEV_MIN_FIELDS || !fields[0].equals(label)) continue;

            long rxBytes = Long.parseLong(fields[1]);
            long rxErrors = Long.parseLong(fields[3]);
            long rxDropped = Long.parseLong(fields[4]);

            long txBytes = Long.parseLong(fields[9]);
            long txErrors = Long.parseLong(fields[11]);
            long txDropped = Long.parseLong(fields[12]);

            return new ContainerStats.NetworkStats(rxBytes, rxDropped, rxErrors, txBytes, txDropped, txErrors);
        }
        throw new IllegalArgumentException("No statistics found for interface " + iface);
    }

    /** Returns total CPU time in µs spent executing all the processes on this host */
    private long systemCpuUsage() throws IOException {
        // NOTE(review): every column of the aggregate "cpu" line is summed. proc(5) documents the guest and
        // guest_nice columns as already included in user and nice, so that time would be counted twice when
        // non-zero - confirm whether this is intended
        long ticks = sumFieldValues(Files.readAllLines(fileSystem.getPath("/proc/stat")), "cpu");
        return userHzToMicroSeconds(ticks);
    }

    /**
     * Returns the sum of all values following the given field name, taken from the first line that starts
     * with exactly that field name and has at least one value. Returns 0 if there is no such line.
     *
     * @throws NumberFormatException if a value on the matching line is not a valid long
     */
    private static long sumFieldValues(List<String> lines, String fieldName) {
        for (String line : lines) {
            String[] fields = fields(line);
            if (fields.length < 2 || !fields[0].equals(fieldName)) continue;

            long sum = 0;
            for (int i = 1; i < fields.length; i++) {
                sum += Long.parseLong(fields[i]);
            }
            return sum; // Only the first matching line is considered
        }
        return 0;
    }

    /** Returns the path to the network device statistics of the given PID */
    private Path netDevPath(int containerPid) {
        return fileSystem.getPath("/proc/" + containerPid + "/net/dev");
    }

    /** Converts clock ticks to microseconds, assuming 100 ticks per second */
    static long userHzToMicroSeconds(long ticks) {
        // Ideally we would read this from _SC_CLK_TCK, but then we need JNI. However, in practice this is always 100 on x86 Linux
        return ticks * 10_000;
    }

    /** Splits the trimmed string on runs of whitespace */
    private static String[] fields(String s) {
        return fields(s, "\\s+");
    }

    /** Splits the trimmed string on the given regular expression */
    private static String[] fields(String s, String regex) {
        return s.trim().split(regex);
    }

}